diff --git a/.bazelrc b/.bazelrc
new file mode 100644
index 0000000..99aacce
--- /dev/null
+++ b/.bazelrc
@@ -0,0 +1,53 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+common --experimental_repo_remote_exec
+common --experimental_remote_download_regex='.*\/dataproxy_sdk$|.*\/arrow$'
+
+
+build --incompatible_new_actions_api=false
+build --copt=-fdiagnostics-color=always
+build --enable_platform_specific_config
+
+build --cxxopt=-std=c++17
+build --host_cxxopt=-std=c++17
+
+build:avx --copt=-mavx
+build:avx --host_copt=-mavx
+build:avx --copt=-DCHECK_AVX
+build:avx --host_copt=-DCHECK_AVX
+
+# Binary safety flags
+build --copt=-fPIC
+build --copt=-fstack-protector-strong
+build:linux --copt=-Wl,-z,noexecstack
+build:macos --copt=-Wa,--noexecstack
+
+test --keep_going
+test --test_output=errors
+test --test_timeout=1800
+
+# statically link runtime libraries on Linux
+build:linux --action_env=BAZEL_LINKOPTS=-static-libstdc++:-static-libgcc
+build:linux --action_env=BAZEL_LINKLIBS=-l%:libstdc++.a:-l%:libgcc.a
+
+# platform specific config
+# Bazel automatically picks the platform config since enable_platform_specific_config is set
+build:linux --copt=-fopenmp
+build:linux --linkopt=-fopenmp
+build:macos --copt="-Xpreprocessor -fopenmp"
+build:macos --copt=-Wno-unused-command-line-argument
+build:macos --features=-supports_dynamic_linker
+build:macos --macos_minimum_os=12.0
+build:macos --host_macos_minimum_os=12.0
diff --git a/.bazelversion b/.bazelversion
new file mode 100644
index 0000000..4be2c72
--- /dev/null
+++ b/.bazelversion
@@ -0,0 +1 @@
+6.5.0
\ No newline at end of file
diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 0000000..c2a3ae5
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,103 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+version: 2.1
+
+parameters:
+  GHA_Actor:
+    type: string
+    default: ""
+  GHA_Action:
+    type: string
+    default: ""
+  GHA_Event:
+    type: string
+    default: ""
+  GHA_Meta:
+    type: string
+    default: ""
+
+executors:
+  openjdk-executor-17:
+    docker:
+      - image: cimg/openjdk:17.0
+
+jobs:
+  build-and-push:
+    executor: openjdk-executor-17
+    steps:
+      - checkout
+      - setup_remote_docker
+      - run:
+          name: Build Jar
+          command: make build
+      - run:
+          name: Push to Docker Hub
+          command: |
+            # login to the dataproxy Docker Hub registry
+            docker login -u ${DOCKER_DEPLOY_USERNAME} -p ${DOCKER_DEPLOY_TOKEN}
+            # login to the dataproxy Aliyun registry
+            docker login -u ${ALIYUN_DOCKER_USERNAME} -p ${ALIYUN_DOCKER_PASSWORD} secretflow-registry.cn-hangzhou.cr.aliyuncs.com
+            CIRCLETAG=$(echo ${CIRCLE_TAG} | sed 's/v//')
+
+            BUILDER_EXISTS=$(
+              docker buildx inspect dataproxy_image_buildx >/dev/null 2>&1
+              echo $?
+            )
+
+            if [ "$BUILDER_EXISTS" -eq 0 ]; then
+              echo "existing buildx builder: dataproxy_image_buildx"
+              docker buildx use dataproxy_image_buildx
+            else
+              echo "creating new buildx builder: dataproxy_image_buildx"
+              docker buildx create --name dataproxy_image_buildx --use
+            fi
+
+            remote_image="secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/dataproxy:latest"
+            docker buildx build \
+              --platform linux/arm64,linux/amd64 \
+              --tag "${remote_image}" \
+              -f ./build/Dockerfiles/dataproxy.Dockerfile . \
+              --push
+
+            remote_image="secretflow/dataproxy:latest"
+            docker buildx build \
+              --platform linux/arm64,linux/amd64 \
+              --tag "${remote_image}" \
+              -f ./build/Dockerfiles/dataproxy.Dockerfile . \
+              --push
+
+            remote_image="secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/dataproxy:${CIRCLETAG}"
+            docker buildx build \
+              --platform linux/arm64,linux/amd64 \
+              --tag "${remote_image}" \
+              -f ./build/Dockerfiles/dataproxy.Dockerfile . \
+              --push
+
+            remote_image="secretflow/dataproxy:${CIRCLETAG}"
+            docker buildx build \
+              --platform linux/arm64,linux/amd64 \
+              --tag "${remote_image}" \
+              -f ./build/Dockerfiles/dataproxy.Dockerfile . \
+              --push
+
+workflows:
+  build-deploy:
+    jobs:
+      - build-and-push:
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/
\ No newline at end of file
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..16b3e5e
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,15 @@
+# Use the Google style in this project.
+BasedOnStyle: Google
+
+IncludeBlocks: Regroup
+IncludeCategories:
+  - Regex: '^<.*\.h>'
+    Priority: 1
+  - Regex: "^<.*"
+    Priority: 2
+  - Regex: '.*\.pb\.h"$'
+    Priority: 5
+  - Regex: '^"secretflow_serving.*'
+    Priority: 4
+  - Regex: '^".*'
+    Priority: 3
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 0000000..4ad05fc
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,78 @@
+Checks: "abseil-cleanup-ctad,
+  abseil-faster-strsplit-delimiter,
+  abseil-duration-*,
+  abseil-no-namespace,
+  abseil-redundant-strcat-calls,
+  abseil-str-cat-append,
+  abseil-string-find-startswith,
+  abseil-upgrade-duration-conversions,
+  bugprone-*,
+  -bugprone-easily-swappable-parameters,
+  -bugprone-implicit-widening-of-multiplication-result,
+  -bugprone-narrowing-conversions, # too many false positives around `std::size_t` vs. `*::difference_type`.
+  google-build-using-namespace,
+  google-explicit-constructor,
+  google-global-names-in-headers,
+  google-readability-casting,
+  google-runtime-int,
+  google-runtime-operator,
+  misc-unused-using-decls,
+  modernize-*,
+  -modernize-use-trailing-return-type,
+  -modernize-avoid-c-arrays,
+  -modernize-return-braced-init-list, # can hurt readability
+  -modernize-use-nodiscard,
+  performance-*,
+  readability-*,
+  -readability-else-after-return,
+  -readability-identifier-length,
+  -readability-function-cognitive-complexity,
+  -readability-magic-numbers,
+  -readability-named-parameter"
+
+CheckOptions:
+  - key: bugprone-argument-comment.StrictMode
+    value: 1
+
+  - key: bugprone-dangling-handle.HandleClasses
+    value: "std::basic_string_view;std::experimental::basic_string_view;absl::string_view"
+
+  - key: misc-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic
+    value: 1
+
+  # Ignore GoogleTest function macros.
+  - key: readability-identifier-naming.FunctionIgnoredRegexp
+    value: "(TEST|TEST_F|TEST_P|INSTANTIATE_TEST_SUITE_P|MOCK_METHOD|TYPED_TEST)"
+
+  - key: readability-identifier-naming.ClassCase
+    value: "CamelCase"
+
+  - key: readability-identifier-naming.EnumCase
+    value: "CamelCase"
+
+  - key: readability-identifier-naming.EnumConstantCase
+    value: "CamelCase"
+
+  - key: readability-identifier-naming.ParameterCase
+    value: "lower_case"
+
+  - key: readability-identifier-naming.PrivateMemberCase
+    value: "lower_case"
+
+  - key: readability-identifier-naming.PrivateMemberSuffix
+    value: "_"
+
+  - key: readability-identifier-naming.StructCase
+    value: "CamelCase"
+
+  - key: readability-identifier-naming.TypeAliasCase
+    value: "CamelCase"
+
+  - key: readability-identifier-naming.UnionCase
+    value: "CamelCase"
+
+  - key: readability-identifier-naming.FunctionCase
+    value: "camelBack"
+
+  - key: performance-unnecessary-value-param.AllowedTypes
+    value: PtBufferView
diff --git a/.github/workflows/cla.yml b/.github/workflows/cla.yml
new file mode 100644
index 0000000..926e495
--- /dev/null
+++ b/.github/workflows/cla.yml
@@ -0,0 +1,11 @@
+---
+name: CLA Assistant
+on:
+  issue_comment:
+    types: [created]
+  pull_request_target:
+    types: [opened, closed, synchronize]
+jobs:
+  CLAssistant:
+    uses: secretflow/.github/.github/workflows/cla.yml@main
+    secrets: inherit
diff --git a/.gitignore b/.gitignore
index 524f096..fa71c8f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,24 +1,60 @@
-# Compiled class file
-*.class
-
-# Log file
-*.log
-
-# BlueJ files
-*.ctxt
-
-# Mobile Tools for Java (J2ME)
-.mtj.tmp/
-
-# Package Files #
-*.jar
-*.war
-*.nar
-*.ear
-*.zip
-*.tar.gz
-*.rar
-
-# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
-hs_err_pid*
-replay_pid*
+HELP.md
+target/
+!.mvn/wrapper/maven-wrapper.jar
+!**/src/main/**/target/
+!**/src/test/**/target/
+**/.DS_Store
+
+### STS ###
+.apt_generated
+.classpath
+.factorypath
+.project
+.settings
+.springBeans
+.sts4-cache
+
+### IntelliJ IDEA ###
+.idea
+*.iws
+*.iml
+*.ipr
+
+### NetBeans ###
+/nbproject/private/
+/nbbuild/
+/dist/
+/nbdist/
+/.nb-gradle/
+!**/src/main/**/build/
+!**/src/test/**/build/
+dataproxy-api/**/kusciaapi/
+dataproxy-api/**/common/
+
+### VS Code ###
+.vscode/
+
+tmp/
+db/
+log/
+build/**/settings.xml
+docs/_build
+
+.java-version
+dataproxy_sdk/MODULE.bazel
+dataproxy_sdk/MODULE.bazel.lock
+
+# clangd cache
+.cache
+external
+
+# cmake related
+abseil-cpp
+bld
+bld.install
+CMakeCache.txt
+cmake_install.cmake
+CTestTestfile.cmake
+
+# bazel
+bazel-*
diff --git a/BUILD.bazel b/BUILD.bazel
new file mode 100644
index 0000000..8289c83
--- /dev/null
+++ b/BUILD.bazel
@@ -0,0 +1,13 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/LEGAL.md b/LEGAL.md
new file mode 100644
index 0000000..f968920
--- /dev/null
+++ b/LEGAL.md
@@ -0,0 +1,7 @@
+Legal Disclaimer
+
+Within this source code, the comments in Chinese shall be the original, governing version. Any comments in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail.
+
+法律免责声明
+
+关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index 261eeb9..f49a4e1 100644
--- a/LICENSE
+++ b/LICENSE
@@ -198,4 +198,4 @@
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-   limitations under the License.
+   limitations under the License.
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100755
index 0000000..9d4963b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,43 @@
+# Setting SHELL to bash allows bash commands to be executed by recipes.
+# Options are set to exit when a recipe line exits non-zero or a piped command fails.
+SHELL = /usr/bin/env bash -o pipefail
+.SHELLFLAGS = -ec
+
+.PHONY: all
+all: build
+
+##@ General
+
+# The help target prints out all targets with their descriptions organized
+# beneath their categories. The categories are represented by '##@' and the
+# target descriptions by '##'. The awk command is responsible for reading the
+# entire set of makefiles included in this invocation, looking for lines of the
+# file matching 'xyz: ## something', and then pretty-formats the target and help. Then,
+# if there's a line with ##@ something, that gets pretty-printed as a category.
+# More info on the usage of ANSI control characters for terminal formatting:
+# https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_parameters
+# More info on the awk command:
+# http://linuxcommand.org/lc3_adv_awk.php
+
+.PHONY: help
+help: ## Display this help.
+	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n  make \033[36m<target>\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf "  \033[36m%-16s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
+
+
+##@ Development
+
+.PHONY: test
+test: ## Run tests.
+	mvn clean test
+
+.PHONY: build
+build: ## Build the DataProxy binary.
+	./scripts/build.sh
+
+.PHONY: image
+image: build ## Build the DataProxy docker image.
+	./scripts/build_image.sh
+
+.PHONY: docs
+docs: ## Build docs.
+	cd docs && pip install -r requirements.txt && make html
\ No newline at end of file
diff --git a/README.md b/README.md
index 986b22c..f5923c0 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,25 @@
-# dataproxy
\ No newline at end of file
+# DataProxy
+
+[简体中文](./README.zh-CN.md) | [English](./README.md)
+
+DataProxy is a data service framework based on [Arrow Flight](https://arrow.apache.org/docs/format/Flight.html) that
+accesses rich data sources and provides unified, easy-to-use, efficient, and robust data reading and writing services.
+With DataProxy:
+
+* You can access various types of data sources, including MySQL, S3, Aliyun OSS, local disk, etc.
+* You can use a consistent read/write interface to perform read and write operations on different data sources.
+
+## Documentation
+
+Currently, we only provide detailed documentation in Chinese.
+
+- [Development](./docs/development/build_dataproxy_cn.md)
+
+## Disclaimer
+
+Non-release versions of DataProxy are for demonstration only and should not be used in production environments.
+Although this version of DataProxy covers its basic capabilities, it may contain security issues and functional
+defects because some features are incomplete or still in progress.
+We welcome your active suggestions and look forward to the official release.
\ No newline at end of file
diff --git a/README.zh-CN.md b/README.zh-CN.md
new file mode 100644
index 0000000..9c53b87
--- /dev/null
+++ b/README.zh-CN.md
@@ -0,0 +1,18 @@
+# DataProxy
+
+[简体中文](./README.zh-CN.md) | [English](./README.md)
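Both README variants describe the same contract: DataProxy is an Arrow Flight service, so a standard Flight client drives it with the usual `getFlightInfo`/`getStream` pair. The sketch below is illustrative only — `FlightReadDemo` is a hypothetical class name, the host and port mirror `config/application.yaml` later in this change, and the empty command payload is a placeholder (the real descriptor format is defined by DataProxy's protos, which are not part of this hunk):

```java
import org.apache.arrow.flight.FlightClient;
import org.apache.arrow.flight.FlightDescriptor;
import org.apache.arrow.flight.FlightInfo;
import org.apache.arrow.flight.FlightStream;
import org.apache.arrow.flight.Location;
import org.apache.arrow.flight.Ticket;
import org.apache.arrow.memory.RootAllocator;

public class FlightReadDemo {
    public static void main(String[] args) throws Exception {
        try (RootAllocator allocator = new RootAllocator();
             FlightClient client = FlightClient.builder(
                     allocator, Location.forGrpcInsecure("127.0.0.1", 8023)).build()) {
            // Ask the server how to read the dataset; the command bytes are a
            // placeholder for the datasource-specific descriptor payload.
            FlightInfo info = client.getInfo(FlightDescriptor.command(new byte[0]));
            // Redeem the ticket from the first endpoint and stream record batches.
            Ticket ticket = info.getEndpoints().get(0).getTicket();
            try (FlightStream stream = client.getStream(ticket)) {
                while (stream.next()) {
                    System.out.println(stream.getRoot().contentToTSVString());
                }
            }
        }
    }
}
```

Note the ticket settings further down in `config/application.yaml`: with `onlyOnce: true` a ticket is single-use, so a client written like this would request a fresh `FlightInfo` for every read.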
+
+DataProxy 是一个基于 [Arrow Flight](https://arrow.apache.org/docs/format/Flight.html) 的数据服务框架,接入丰富的数据源,提供统一、易用、高效、健壮的数据读写服务。通过 DataProxy:
+
+* 你可以接入丰富的数据源,其中包括 MySQL、S3、Aliyun OSS、本地磁盘等
+* 你可以使用统一的接口来实现对不同数据源的读写操作
+
+## Documentation
+
+- [Development](./docs/development/build_dataproxy_cn.md)
+
+## 声明
+
+非正式发布的 DataProxy 版本仅用于演示,请勿在生产环境中使用。尽管此版本已涵盖 DataProxy 的基础功能,但由于项目存在功能不足和待完善项,可能存在部分安全问题和功能缺陷。因此,我们欢迎你积极提出建议,并期待正式版本的发布。
\ No newline at end of file
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 0000000..0696718
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,72 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+workspace(name = "dataproxy")
+
+load("@dataproxy//dataproxy_sdk/bazel:repositories.bzl", "dataproxy_deps")
+
+dataproxy_deps()
+
+load("@yacl//bazel:repositories.bzl", "yacl_deps")
+
+yacl_deps()
+
+load(
+    "@rules_foreign_cc//foreign_cc:repositories.bzl",
+    "rules_foreign_cc_dependencies",
+)
+
+rules_foreign_cc_dependencies(
+    register_built_tools = False,
+    register_default_tools = False,
+    register_preinstalled_tools = True,
+)
+
+#
+# boost
+#
+load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps")
+
+boost_deps()
+
+load("@rules_python//python:repositories.bzl", "py_repositories")
+
+py_repositories()
+
+load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
+
+rules_pkg_dependencies()
+
+load("@pybind11_bazel//:python_configure.bzl", "python_configure")
+
+python_configure(
+    name = "local_config_python",
+    python_version = "3",
+)
+
+load("@rules_proto_grpc//:repositories.bzl", "rules_proto_grpc_repos", "rules_proto_grpc_toolchains")
+
+rules_proto_grpc_toolchains()
+
+rules_proto_grpc_repos()
+
+# Load gRPC dependencies after load.
+load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps") + +grpc_deps() + +# Load extra gRPC dependencies due to https://github.com/grpc/grpc/issues/20511 +load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps") + +grpc_extra_deps() diff --git a/build/Dockerfiles/dataproxy.Dockerfile b/build/Dockerfiles/dataproxy.Dockerfile new file mode 100644 index 0000000..cc9aae9 --- /dev/null +++ b/build/Dockerfiles/dataproxy.Dockerfile @@ -0,0 +1,14 @@ +FROM secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/secretpad-base-lite:0.3 + +ENV LANG=C.UTF-8 +WORKDIR /app + +# fix: RunP proot + java bug +RUN ln -s ${JAVA_HOME}/lib/libjli.so /lib64 + +COPY target/*.jar dataproxy.jar +COPY config/application.yaml application.yaml +COPY scripts/start_dp.sh start_dp.sh +ENV JAVA_OPTS="" SPRING_PROFILES_ACTIVE="default" +EXPOSE 8023 +ENTRYPOINT ${JAVA_HOME}/bin/java ${JAVA_OPTS} -Dsun.net.http.allowRestrictedHeaders=true --add-opens=java.base/java.nio=ALL-UNNAMED -jar -Dspring.profiles.active=${SPRING_PROFILES_ACTIVE} ./dataproxy.jar \ No newline at end of file diff --git a/config/application.yaml b/config/application.yaml new file mode 100644 index 0000000..080afba --- /dev/null +++ b/config/application.yaml @@ -0,0 +1,26 @@ +spring: + # profiles: + # active: local + autoconfigure: + exclude: org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration + application: + name: dataproxy + servlet: + multipart: + max-file-size: -1 + max-request-size: -1 + file-size-threshold: -1 + +logging: + level: + root: info + file: + path: "./logs" + +dataproxy: + flight: + host: 127.0.0.1 # getFlightInfo 返回的endpoint ip + port: 8023 + ticket: + timeout: 300 # 过期时间,单位秒 + onlyOnce: true # 是否一次性,true:一次性使用,false:允许多次调用,超时销毁 \ No newline at end of file diff --git a/dataproxy-api/pom.xml b/dataproxy-api/pom.xml new file mode 100644 index 0000000..9d29f4f --- /dev/null +++ b/dataproxy-api/pom.xml @@ -0,0 +1,49 @@ + + + 4.0.0 + + org.secretflow + dataproxy + 0.0.1-SNAPSHOT + + + dataproxy-api + + + + com.google.protobuf + protobuf-java + + + com.google.protobuf + protobuf-java-util + + + org.apache.arrow + flight-grpc + + + io.grpc + grpc-netty-shaded + + + javax.annotation + javax.annotation-api + + + + + + + org.xolstice.maven.plugins + protobuf-maven-plugin + + ../proto + + + + + + \ No newline at end of file diff --git a/dataproxy-common/pom.xml b/dataproxy-common/pom.xml new file mode 100644 index 0000000..105c26e --- /dev/null +++ b/dataproxy-common/pom.xml @@ -0,0 +1,91 @@ + + + 4.0.0 + + org.secretflow + dataproxy + 0.0.1-SNAPSHOT + + + dataproxy-common + + + + org.secretflow + dataproxy-api + + + + org.projectlombok + lombok + + + com.fasterxml.jackson.datatype + jackson-datatype-jsr310 + + + org.apache.commons + commons-collections4 + + + org.apache.commons + commons-lang3 + + + commons-io + commons-io + + + io.netty + netty-all + + + io.netty + netty-tcnative-boringssl-static + + + com.squareup.okio + okio + + + + com.google.protobuf + protobuf-java-util + + + + + org.apache.arrow + arrow-vector + + + + org.apache.arrow + arrow-dataset + + + + org.apache.arrow + arrow-memory-netty + + + + org.apache.arrow + arrow-format + + + + org.apache.arrow + flight-core + + + org.apache.arrow + flight-grpc + + + + + + \ No newline at end of file diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/exceptions/DataproxyErrorCode.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/exceptions/DataproxyErrorCode.java new file mode 100644 index 
index 0000000..ccb7114
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/exceptions/DataproxyErrorCode.java
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.exceptions;
+
+import lombok.Getter;
+
+/**
+ * dataproxy error code enums
+ *
+ * @author muhong
+ * @date 2023-09-14 14:32
+ */
+@Getter
+public enum DataproxyErrorCode {
+
+    SUCCESS(ErrorLevels.INFO, ErrorTypes.BIZ, "000", "success"),
+
+    //============================= System errors [001-399] ==================================
+    // SaaS scenarios [001-099]
+    SAAS_GET_FLIGHT_INFO_READ_ERROR(ErrorLevels.ERROR, ErrorTypes.SYSTEM, "001", "Exception handling request to get data read from the terminal"),
+    SAAS_GET_FLIGHT_INFO_WRITE_ERROR(ErrorLevels.ERROR, ErrorTypes.SYSTEM, "002", "Exception handling request to get data written to the terminal"),
+    // Kuscia scenarios [100-199]
+    KUSCIA_GET_FLIGHT_INFO_QUERY_ERROR(ErrorLevels.ERROR, ErrorTypes.SYSTEM, "101", "Exception handling request to get data read from the terminal"),
+    KUSCIA_GET_FLIGHT_INFO_UPDATE_ERROR(ErrorLevels.ERROR, ErrorTypes.SYSTEM, "102", "Exception handling request to get data written to the terminal"),
+    KUSCIA_GET_STREAM_ERROR(ErrorLevels.ERROR, ErrorTypes.SYSTEM, "103", "Data read error"),
+    KUSCIA_ACCEPT_PUT_ERROR(ErrorLevels.ERROR, ErrorTypes.SYSTEM, "104", "Data write error"),
+
+    // Common errors [200-399]
+    DATASET_WRITE_ERROR(ErrorLevels.ERROR, ErrorTypes.SYSTEM, "200", "Data Write Exception"),
+    CACHE_SERVICE_BEAN_NOT_FOUND(ErrorLevels.ERROR, ErrorTypes.BIZ, "201", "Cannot find Cache Service Bean"),
+
+
+    //============================= Business errors [400-850] ==================================
+    // Common parameter errors [400-409]
+    PARAMS_NOT_EXIST_ERROR(ErrorLevels.ERROR, ErrorTypes.PARAM, "400", "Exception of missing parameter"),
+    PARAMS_UNRELIABLE(ErrorLevels.ERROR, ErrorTypes.PARAM, "401", "Invalid parameter"),
+    // Framework errors [410-449]
+    TICKET_UNAVAILABLE(ErrorLevels.ERROR, ErrorTypes.BIZ, "410", "Ticket invalid or expired"),
+    CREATE_DATASOURCE_CONNECTOR_ERROR(ErrorLevels.ERROR, ErrorTypes.BIZ, "411", "Failed to create data source connector"),
+    UNSUPPORTED_FIELD_TYPE(ErrorLevels.ERROR, ErrorTypes.PARAM, "412", "Unsupported field type"),
+    INVALID_PARTITION_SPEC(ErrorLevels.ERROR, ErrorTypes.BIZ, "413", "Invalid partition expression"),
+
+    // JDBC datasource errors [450-499]
+    JDBC_DATASOURCE_CONNECTION_POOL_BUILD_ERROR(ErrorLevels.ERROR, ErrorTypes.BIZ, "450", "Failed to create JDBC connection pool"),
+    JDBC_DATASOURCE_CONNECTION_VALIDATE_FAILED(ErrorLevels.ERROR, ErrorTypes.PARAM, "451", "JDBC data source connectivity test failed"),
+    JDBC_CALL_ERROR(ErrorLevels.ERROR, ErrorTypes.BIZ, "452", "JDBC data source request failed"),
+    JDBC_GET_PRIMARY_KEY_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "453", "Failed to infer primary key"),
+    JDBC_GET_PARTITION_STATS_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "454", "Failed to infer pagination parameters"),
+    JDBC_FETCH_BATCH_DATA_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "455", "Failed to retrieve data block"),
+    JDBC_GET_CONN_THREAD_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "456", "Failed to get connection from pool"),
+    JDBC_CREATE_TABLE_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "460", "Failed to create table"),
+    JDBC_INSERT_INTO_TABLE_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "461", "Failed to write data into table"),
+    UNSUPPORTED_INDEX_TYPE(ErrorLevels.ERROR, ErrorTypes.PARAM, "462", "Unsupported index type"),
+    FIELD_NOT_EXIST(ErrorLevels.ERROR, ErrorTypes.PARAM, "463", "Field does not exist"),
+
+    // File datasource errors [500-549]
+    FILE_READ_STREAM_CREATE_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "500", "Failed to create file read stream"),
+    FILE_BATCH_DOWNLOAD_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "501", "Failed to download file data block"),
+    GET_FILE_SIZE_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "502", "Failed to retrieve file size"),
+    FILE_WRITE_STREAM_CREATE_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "503", "Failed to create a file output stream for writing"),
+    BINARY_DATA_FIELD_NOT_EXIST(ErrorLevels.ERROR, ErrorTypes.PARAM, "503", "The 'binary_data' column does not exist in the binary file for writing"),
+    READ_DATA_LINE_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "504", "Failed to read data row"),
+    HEADER_LINE_NOT_EXIST(ErrorLevels.ERROR, ErrorTypes.PARAM, "505", "Original file table header does not exist"),
+    HEADER_LINE_PARSE_FAILED(ErrorLevels.ERROR, ErrorTypes.PARAM, "506", "Table header parsing failed"),
+    VALUE_LINE_PARSE_FAILED(ErrorLevels.ERROR, ErrorTypes.PARAM, "507", "Failed to parse data row"),
+    BOM_REMOVE_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "508", "Failed to remove BOM header"),
+    DETECT_ENCODING_FAILED(ErrorLevels.ERROR, ErrorTypes.PARAM, "509", "Failed to infer CSV file encoding format"),
+    READER_RELEASE_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "510", "Failed to release data read stream"),
+    DATA_FORMAT_CONVERT_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "511", "Failed to convert target data format"),
+
+    // ODPS errors
+    ODPS_CREATE_TABLE_FAILED(ErrorLevels.ERROR, ErrorTypes.BIZ, "600", "Create ODPS table failed"),
+    ODPS_ERROR(ErrorLevels.ERROR, ErrorTypes.BIZ, "601", "ODPS error"),
+
+    //============================= Third-party errors [900-999] ==================================
+
+    ;
+    /**
+     * Error code prefix (2 character)
+     */
+    private final static String ERROR_PREFIX = "DP";
+
+    /**
+     * Error version (1 character)
+     */
+    private final static String ERROR_VERSION = "0";
+
+    /**
+     * Error scene (4 character)
+     * 0001 dataproxy
+     */
+    private final static String ERROR_SCENE = "0001";
+
+    /**
+     * Error level (1 character)
+     */
+    private final ErrorLevels errorLevel;
+
+    /**
+     * Error type (1 character)
+     */
+    private final ErrorTypes errorType;
+
+    /**
+     * Error specific id (3 character)
+     */
+    private final String errorSpecific;
+
+    /**
+     * Error code (12 character): {@link #ERROR_PREFIX} + {@link #ERROR_VERSION} + {@link #ERROR_SCENE} + {@link #errorLevel} + {@link #errorType} + {@link #errorSpecific}
+     */
+    private final String errorCode;
+
+    /**
+     * Error message
+     */
+    private final String errorMessage;
+
+    /**
+     * Construct function
+     *
+     * @param errorLevel    error level
+     * @param errorType     error type
+     * @param errorSpecific error specific id
+     * @param errorMessage  error message
+     */
+    DataproxyErrorCode(ErrorLevels errorLevel,
+                       ErrorTypes errorType,
+                       String errorSpecific,
+                       String errorMessage) {
+        this.errorLevel = errorLevel;
+        this.errorType = errorType;
+        this.errorSpecific = errorSpecific;
+        this.errorMessage = errorMessage;
+        this.errorCode = ERROR_PREFIX + ERROR_VERSION + ERROR_SCENE + errorLevel.getCode()
+                + errorType.getCode() + errorSpecific;
+    }
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/exceptions/DataproxyException.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/exceptions/DataproxyException.java
new file mode 100644
index 0000000..3999780
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/exceptions/DataproxyException.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.exceptions;
+
+import lombok.extern.slf4j.Slf4j;
+
+/**
+ * dataproxy exception
+ *
+ * @author muhong
+ * @date 2023-09-14 14:23
+ */
+@Slf4j
+public class DataproxyException extends RuntimeException {
+    private final DataproxyErrorCode errorCode;
+
+    public DataproxyException(DataproxyErrorCode errorCode) {
+        super(errorCode.getErrorMessage());
+        this.errorCode = errorCode;
+    }
+
+    public DataproxyException(DataproxyErrorCode errorCode, String message) {
+        super(errorCode.getErrorMessage() + ": " + message);
+        this.errorCode = errorCode;
+    }
+
+    public DataproxyException(DataproxyErrorCode errorCode, Throwable cause) {
+        super(errorCode.getErrorMessage(), cause);
+        this.errorCode = errorCode;
+    }
+
+    public DataproxyException(DataproxyErrorCode errorCode, String message, Throwable cause) {
+        super(errorCode.getErrorMessage() + ": " + message, cause);
+        this.errorCode = errorCode;
+    }
+
+    public static DataproxyException of(DataproxyErrorCode errorCode) {
+        return new DataproxyException(errorCode);
+    }
+
+    public static DataproxyException of(DataproxyErrorCode errorCode, String message) {
+        return new DataproxyException(errorCode, message);
+    }
+
+    public static DataproxyException of(DataproxyErrorCode errorCode, Throwable cause) {
+        return new DataproxyException(errorCode, cause);
+    }
+
+    public static DataproxyException of(DataproxyErrorCode errorCode, String message, Throwable cause) {
+        return new DataproxyException(errorCode, message, cause);
+    }
+
+    public DataproxyErrorCode getErrorCode() {
+        return errorCode;
+    }
+
+    public String getDescription() {
+        return String.format("code: %s, message: %s", getErrorCode().getErrorCode(), getMessage());
+    }
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/exceptions/ErrorLevels.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/exceptions/ErrorLevels.java
new file mode 100644
index 0000000..84b09f9
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/exceptions/ErrorLevels.java
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
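To make the 12-character code layout above concrete: `JDBC_CALL_ERROR` carries level `ERROR` ("5"), type `BIZ` ("1") and specific "452", so its full code composes as "DP" + "0" + "0001" + "5" + "1" + "452". A small sketch — `ErrorCodeDemo` is a hypothetical name, and it assumes only that the two classes from this change are on the classpath:

```java
import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode;
import org.secretflow.dataproxy.common.exceptions.DataproxyException;

public class ErrorCodeDemo {
    public static void main(String[] args) {
        // Wrap an error code with extra detail, as call sites would.
        DataproxyException e =
                DataproxyException.of(DataproxyErrorCode.JDBC_CALL_ERROR, "connection refused");
        // Prints: code: DP0000151452, message: JDBC data source request failed: connection refused
        System.out.println(e.getDescription());
    }
}
```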
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.exceptions; + +import lombok.Getter; + +/** + * error levels + * + * @author muhong + * @date 2023-09-14 14:22 + */ +@Getter +public enum ErrorLevels { + /** + * INFO + */ + INFO("1"), + + /** + * WARN + */ + WARN("3"), + + /** + * ERROR + */ + ERROR("5"), + + /** + * FATAL + */ + FATAL("7"); + + private final String code; + + ErrorLevels(String code) { + this.code = code; + } +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/exceptions/ErrorTypes.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/exceptions/ErrorTypes.java new file mode 100644 index 0000000..90aaab6 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/exceptions/ErrorTypes.java @@ -0,0 +1,54 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.exceptions; + +import lombok.Getter; + +/** + * error types + * + * @author muhong + * @date 2023-09-14 14:23 + */ +@Getter +public enum ErrorTypes { + /** + * System error + */ + SYSTEM("0"), + + /** + * Biz error + */ + BIZ("1"), + + /** + * Third party error + */ + THIRD_PARTY("2"), + + /** + * Param error + */ + PARAM("3"); + + private final String code; + + ErrorTypes(String code) { + this.code = code; + } +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/FlightContentFormatConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/FlightContentFormatConfig.java new file mode 100644 index 0000000..edfcdb7 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/FlightContentFormatConfig.java @@ -0,0 +1,45 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.common.model; + +import org.secretflow.dataproxy.common.model.dataset.format.FormatConfig; + +import lombok.*; + +/** + * Flight data format + * + * @author muhong + * @date 2023-08-31 11:27 + */ +@Getter +@Setter +@Builder +@AllArgsConstructor +@NoArgsConstructor +public class FlightContentFormatConfig { + + /** + * Format type + */ + private FlightContentFormatTypeEnum formatType; + + /** + * Format content + */ + private FormatConfig formatConfig; +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/FlightContentFormatTypeEnum.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/FlightContentFormatTypeEnum.java new file mode 100644 index 0000000..bc182a0 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/FlightContentFormatTypeEnum.java @@ -0,0 +1,32 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model; + +/** + * Flight data format type + * + * @author muhong + * @date 2023-08-31 11:23 + */ +public enum FlightContentFormatTypeEnum { + + STRUCTURED_DATA, + + BINARY_FILE, + + CSV +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/InferSchemaResult.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/InferSchemaResult.java new file mode 100644 index 0000000..aecb069 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/InferSchemaResult.java @@ -0,0 +1,48 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.common.model; + +import org.secretflow.dataproxy.common.model.dataset.DatasetFormatConfig; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; +import org.apache.arrow.vector.types.pojo.Schema; + +/** + * Infer schema result + * + * @author muhong + * @date 2023-09-12 19:36 + */ +@Getter +@Builder +@AllArgsConstructor +@NoArgsConstructor +public class InferSchemaResult { + + /** + * Arrow schema + */ + private Schema schema; + + /** + * Data format config + */ + private DatasetFormatConfig datasetFormatConfig; +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/Command.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/Command.java new file mode 100644 index 0000000..e2b42b2 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/Command.java @@ -0,0 +1,45 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.command; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; + +/** + * Data action command + * + * @author muhong + * @date 2023-08-31 11:46 + */ +@Getter +@Builder +@AllArgsConstructor +@NoArgsConstructor +public class Command { + + /** + * Command type + */ + private CommandTypeEnum type; + + /** + * command info + */ + private CommandInfo commandInfo; +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/CommandInfo.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/CommandInfo.java new file mode 100644 index 0000000..91d6868 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/CommandInfo.java @@ -0,0 +1,26 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.common.model.command; + +/** + * Command info + * + * @author muhong + * @date 2023-08-31 11:09 + */ +public interface CommandInfo { +} \ No newline at end of file diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/CommandTypeEnum.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/CommandTypeEnum.java new file mode 100644 index 0000000..3a5583c --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/CommandTypeEnum.java @@ -0,0 +1,36 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.command; + +/** + * Command type enum + * + * @author muhong + * @date 2023-08-31 11:08 + */ +public enum CommandTypeEnum { + + /** + * Data read + */ + READ, + + /** + * Data write + */ + WRITE +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/DatasetReadCommand.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/DatasetReadCommand.java new file mode 100644 index 0000000..d4b7609 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/DatasetReadCommand.java @@ -0,0 +1,82 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.secretflow.dataproxy.common.model.command;
+
+import org.secretflow.dataproxy.common.model.FlightContentFormatConfig;
+import org.secretflow.dataproxy.common.model.dataset.DatasetFormatConfig;
+import org.secretflow.dataproxy.common.model.datasource.DatasetLocationConfig;
+import org.secretflow.dataproxy.common.model.datasource.DatasourceConnConfig;
+
+import lombok.*;
+import org.apache.arrow.vector.types.pojo.Schema;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Data read command content
+ *
+ * @author muhong
+ * @date 2023-08-31 11:06
+ */
+@Getter
+@Setter
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+public class DatasetReadCommand implements CommandInfo {
+
+    /**
+     * Datasource connection config
+     */
+    private DatasourceConnConfig connConfig;
+
+    /**
+     * The location of the dataset in its datasource
+     */
+    private DatasetLocationConfig locationConfig;
+
+    /**
+     * Data format config
+     */
+    private DatasetFormatConfig formatConfig;
+
+    /**
+     * Data arrow schema
+     */
+    private Schema schema;
+
+    /**
+     * The field names to read (if struct data)
+     */
+    private List<String> fieldList;
+
+    /**
+     * The filter condition for this read action (if struct data)
+     */
+    private String filter;
+
+    /**
+     * Data output format config
+     */
+    private FlightContentFormatConfig outputFormatConfig;
+
+    /**
+     * extra options
+     */
+    private Map<String, String> extraOptions;
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/DatasetWriteCommand.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/DatasetWriteCommand.java
new file mode 100644
index 0000000..96cb08e
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/command/DatasetWriteCommand.java
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
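A sketch of how a caller might assemble the command model above. The schema fields, field list, and filter string are invented for illustration, and the datasource/location/format configs are left unset because their classes sit outside this hunk:

```java
import org.secretflow.dataproxy.common.model.command.Command;
import org.secretflow.dataproxy.common.model.command.CommandTypeEnum;
import org.secretflow.dataproxy.common.model.command.DatasetReadCommand;

import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;

import java.util.List;

public class ReadCommandDemo {
    public static void main(String[] args) {
        // The Arrow schema the read is expected to produce.
        Schema schema = new Schema(List.of(
                Field.nullable("id", new ArrowType.Int(64, true)),
                Field.nullable("name", new ArrowType.Utf8())));

        // Projection and filter for structured data; conn/location/format
        // configs are omitted purely to keep the sketch self-contained.
        DatasetReadCommand read = DatasetReadCommand.builder()
                .schema(schema)
                .fieldList(List.of("id", "name"))
                .filter("id >= 100")
                .build();

        Command command = Command.builder()
                .type(CommandTypeEnum.READ)
                .commandInfo(read)
                .build();

        System.out.println(command.getType()); // READ
    }
}
```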
+ */
+
+package org.secretflow.dataproxy.common.model.command;
+
+import org.secretflow.dataproxy.common.model.FlightContentFormatConfig;
+import org.secretflow.dataproxy.common.model.dataset.DatasetFormatConfig;
+import org.secretflow.dataproxy.common.model.datasource.DatasetLocationConfig;
+import org.secretflow.dataproxy.common.model.datasource.DatasourceConnConfig;
+
+import lombok.*;
+import org.apache.arrow.vector.types.pojo.Schema;
+
+import java.util.Map;
+
+/**
+ * Data write command content
+ *
+ * @author muhong
+ * @date 2023-08-31 11:31
+ */
+@Getter
+@Setter
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+public class DatasetWriteCommand implements CommandInfo {
+
+    /**
+     * Datasource connection config
+     */
+    private DatasourceConnConfig connConfig;
+
+    /**
+     * The location of the dataset in its datasource
+     */
+    private DatasetLocationConfig locationConfig;
+
+    /**
+     * Data format config
+     */
+    private DatasetFormatConfig formatConfig;
+
+    /**
+     * Data arrow schema
+     */
+    private Schema schema;
+
+    /**
+     * Data input format config
+     */
+    private FlightContentFormatConfig inputFormatConfig;
+
+    /**
+     * extra options
+     */
+    private Map<String, String> extraOptions;
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/DataSceneEnum.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/DataSceneEnum.java
new file mode 100644
index 0000000..6709ab3
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/DataSceneEnum.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.model.dataset;
+
+/**
+ * Dataset scene
+ *
+ * @author muhong
+ * @date 2023-08-30 19:21
+ */
+public enum DataSceneEnum {
+
+    /**
+     * Local dataset
+     */
+    LOCAL_DATASET,
+
+    /**
+     * Job result
+     */
+    RESULT,
+
+    /**
+     * Temp data
+     */
+    TEMP,
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/Dataset.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/Dataset.java
new file mode 100644
index 0000000..b71f9e1
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/Dataset.java
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.secretflow.dataproxy.common.model.dataset; + +import org.secretflow.dataproxy.common.model.datasource.DatasetLocationConfig; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.NoArgsConstructor; + +import java.util.Map; + +/** + * Dataset + * + * @author muhong + * @date 2023-08-30 19:20 + */ +@Getter +@Builder +@AllArgsConstructor +@NoArgsConstructor +public class Dataset { + + /** + * Dataset unique id + */ + private String datasetId; + + /** + * Dataset name + */ + private String name; + + /** + * Dataset description + */ + private String description; + + /** + * Dataset scene + */ + private DataSceneEnum dataScene; + + /** + * Dataset location in its datasource + */ + private DatasetLocationConfig locationConfig; + + /** + * Dataset format config + */ + private DatasetFormatConfig formatConfig; + + /** + * Dataset schema + */ + private DatasetSchema schema; + + /** + * Dataset owner id + */ + private String ownerId; + + /** + * Attributes + */ + private Map attributes; +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/DatasetFormatConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/DatasetFormatConfig.java new file mode 100644 index 0000000..e0c7b61 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/DatasetFormatConfig.java @@ -0,0 +1,48 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.dataset; + +import org.secretflow.dataproxy.common.model.dataset.format.DatasetFormatTypeEnum; +import org.secretflow.dataproxy.common.model.dataset.format.FormatConfig; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +/** + * Dataset format config + * + * @author muhong + * @date 2023-08-30 19:20 + */ +@Data +@Builder +@AllArgsConstructor +@NoArgsConstructor +public class DatasetFormatConfig { + + /** + * Format type + */ + private DatasetFormatTypeEnum type; + + /** + * Format content + */ + private FormatConfig formatConfig; +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/DatasetSchema.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/DatasetSchema.java new file mode 100644 index 0000000..f20e07b --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/DatasetSchema.java @@ -0,0 +1,54 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.model.dataset;
+
+import org.secretflow.dataproxy.common.model.dataset.schema.DatasetSchemaTypeEnum;
+import org.secretflow.dataproxy.common.model.dataset.schema.FastDFSchema;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+import org.apache.arrow.vector.types.pojo.Schema;
+
+/**
+ * Dataset schema
+ *
+ * @author muhong
+ * @date 2023-08-30 19:21
+ */
+@Data
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+public class DatasetSchema {
+
+    /**
+     * Schema type
+     */
+    private DatasetSchemaTypeEnum type;
+
+    /**
+     * Schema
+     */
+    private FastDFSchema schema;
+
+    /**
+     * Arrow schema
+     */
+    private Schema arrowSchema;
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/CSVFormatConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/CSVFormatConfig.java
new file mode 100644
index 0000000..7adcf39
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/CSVFormatConfig.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.model.dataset.format;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+
+import java.util.Map;
+
+/**
+ * CSV format config
+ *
+ * @author muhong
+ * @date 2023-08-30 19:32
+ */
+@Getter
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+public class CSVFormatConfig implements FormatConfig {
+
+    /**
+     * Field name map, key: raw name, value: output name
+     */
+    Map<String, String> fieldMap;
+
+    /**
+     * With header line
+     */
+    @Builder.Default
+    private Boolean withHeaderLine = true;
+
+    /**
+     * Separator
+     */
+    @Builder.Default
+    private String separator = ",";
+
+    /**
+     * QuoteChar
+     */
+    @Builder.Default
+    private String quoteChar = "\"";
+
+    /**
+     * EscapeChar
+     */
+    @Builder.Default
+    private String escapeChar = "\\";
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/DatasetFormatTypeEnum.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/DatasetFormatTypeEnum.java
new file mode 100644
index 0000000..d4f4944
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/DatasetFormatTypeEnum.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
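Because `CSVFormatConfig` above declares its defaults with `@Builder.Default`, a caller can override a single knob and keep the rest. A minimal hypothetical sketch (`CsvConfigDemo` is not part of this change):

```java
import org.secretflow.dataproxy.common.model.dataset.format.CSVFormatConfig;

import java.util.Map;

public class CsvConfigDemo {
    public static void main(String[] args) {
        CSVFormatConfig config = CSVFormatConfig.builder()
                .fieldMap(Map.of("id", "user_id")) // raw name -> output name
                .separator(";")                    // override one default
                .build();

        System.out.println(config.getWithHeaderLine()); // true, default preserved
        System.out.println(config.getSeparator());      // ";"
    }
}
```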
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.dataset.format; + +/** + * Dataset format type enum + * + * @author muhong + * @date 2023-08-31 15:06 + */ +public enum DatasetFormatTypeEnum { + + /** + * Table + */ + TABLE, + + /** + * CSV file + */ + CSV, + + /** + * Binary file + */ + BINARY_FILE +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/FormatConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/FormatConfig.java new file mode 100644 index 0000000..ded33f6 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/FormatConfig.java @@ -0,0 +1,26 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.dataset.format; + +/** + * Dataset format config + * @author muhong + * @date 2023-08-30 19:24 + */ +public interface FormatConfig { + +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/IndexType.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/IndexType.java new file mode 100644 index 0000000..70f3247 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/IndexType.java @@ -0,0 +1,35 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.secretflow.dataproxy.common.model.dataset.format;
+
+/**
+ * Table index type
+ *
+ * @author yumu
+ * @date 2023/9/4 10:22
+ */
+public enum IndexType {
+
+    /**
+     * Unique index
+     */
+    UNIQUE,
+
+    /**
+     * Common index
+     */
+    INDEX
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/PartitionBehavior.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/PartitionBehavior.java
new file mode 100644
index 0000000..14f500d
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/PartitionBehavior.java
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.model.dataset.format;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.List;
+
+/**
+ * Partition behavior
+ *
+ * @author muhong
+ * @date 2023-10-23 15:44
+ */
+@Data
+@NoArgsConstructor
+@AllArgsConstructor
+@Builder
+public class PartitionBehavior {
+
+    /**
+     * Field name
+     */
+    private String fieldName;
+
+    /**
+     * Field type
+     */
+    private ArrowType.ArrowTypeID type;
+
+    /**
+     * Lower bound
+     */
+    private String lowerBound;
+
+    /**
+     * Upper bound
+     */
+    private String upperBound;
+
+    /**
+     * Partition step
+     */
+    private String step;
+
+    /**
+     * Predicates, e.g. ["id>=0 AND id<100", "id>=100 AND id<200", "id>=200 AND id<300"]
+     */
+    private List<String> predicates;
+
+    public boolean isValid() {
+        return StringUtils.isNotEmpty(fieldName)
+                && type != null
+                && StringUtils.isNotEmpty(lowerBound)
+                && StringUtils.isNotEmpty(upperBound);
+    }
+}
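Editor's note: PartitionBehavior only carries bounds, a step, and ready-made predicates; whatever consumes it turns the range into per-split WHERE clauses for parallel reads. A sketch of how such predicates could be derived, assuming numeric bounds and a non-null step (an illustration, not the project's actual splitting code):

import java.util.ArrayList;
import java.util.List;

public class PartitionExample {
    // Build range predicates like "id>=0 AND id<100" from the configured bounds and step.
    static List<String> buildPredicates(PartitionBehavior behavior) {
        long lower = Long.parseLong(behavior.getLowerBound());
        long upper = Long.parseLong(behavior.getUpperBound());
        long step = Long.parseLong(behavior.getStep());
        String field = behavior.getFieldName();

        List<String> predicates = new ArrayList<>();
        for (long start = lower; start < upper; start += step) {
            long end = Math.min(start + step, upper);
            predicates.add(field + ">=" + start + " AND " + field + "<" + end);
        }
        return predicates;
    }
}

diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/TableFormatConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/TableFormatConfig.java
new file mode 100644
index 0000000..30fce07
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/TableFormatConfig.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.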
+ */ + +package org.secretflow.dataproxy.common.model.dataset.format; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +import java.util.List; +import java.util.Map; + +/** + * Table format config + * @author muhong + * @date 2023-08-30 19:36 + */ +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder(toBuilder = true) +public class TableFormatConfig implements FormatConfig { + /** + * Primary key + */ + private String primaryKey; + + /** + * Index list + */ + private List indexList; + + /** + * Partition behavior + */ + private PartitionBehavior partitionBehavior; + + /** + * Field name map + */ + private Map fieldMap; +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/TableIndex.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/TableIndex.java new file mode 100644 index 0000000..81d0529 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/format/TableIndex.java @@ -0,0 +1,52 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.dataset.format; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +import java.util.List; + +/** + * Table index + * + * @author yumu + * @date 2023/9/4 10:17 + */ +@Data +@NoArgsConstructor +@AllArgsConstructor +@Builder +public class TableIndex { + + /** + * Index name + */ + private String indexName; + + /** + * Index type + */ + private IndexType type; + + /** + * Index field name list + */ + private List field; +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/DataField.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/DataField.java new file mode 100644 index 0000000..0eb4bcb --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/DataField.java @@ -0,0 +1,53 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.secretflow.dataproxy.common.model.dataset.schema;
+
+import lombok.*;
+
+/**
+ * Data column info
+ *
+ * @author muhong
+ * @date 2023-08-30 19:38
+ */
+@Getter
+@Setter
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+public class DataField {
+
+    /**
+     * Field name
+     */
+    private String name;
+
+    /**
+     * Field description
+     */
+    private String description;
+
+    /**
+     * Field type
+     */
+    private DataFieldTypeEnum type;
+
+    /**
+     * Whether the field is nullable
+     */
+    private Boolean nullable;
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/DataFieldTypeEnum.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/DataFieldTypeEnum.java
new file mode 100644
index 0000000..4e8cc25
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/DataFieldTypeEnum.java
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.model.dataset.schema;
+
+/**
+ * Structured data field type enum
+ *
+ * @author muhong
+ * @date 2023-08-31 15:13
+ */
+public enum DataFieldTypeEnum {
+
+    STRING,
+
+    INTEGER,
+
+    DOUBLE,
+
+    BOOLEAN,
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/DatasetSchemaTypeEnum.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/DatasetSchemaTypeEnum.java
new file mode 100644
index 0000000..cb4da85
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/DatasetSchemaTypeEnum.java
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.model.dataset.schema;
+
+/**
+ * Dataset schema type
+ *
+ * @author muhong
+ * @date 2023-08-31 15:09
+ */
+public enum DatasetSchemaTypeEnum {
+
+    /**
+     * Structured data
+     */
+    STRUCTURED_DATA,
+
+    /**
+     * Binary data
+     */
+    BINARY,
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/FastDFSchema.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/FastDFSchema.java
new file mode 100644
index 0000000..6cff9e6
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/FastDFSchema.java
@@ -0,0 +1,25 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.model.dataset.schema;
+
+/**
+ * Dataset schema marker interface
+ *
+ * @author muhong
+ * @date 2023-08-30 19:31
+ */
+public interface FastDFSchema {
+
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/StructuredDataSchema.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/StructuredDataSchema.java
new file mode 100644
index 0000000..7c54108
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/dataset/schema/StructuredDataSchema.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.model.dataset.schema;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+import java.util.List;
+
+/**
+ * Structured data schema
+ *
+ * @author muhong
+ * @date 2023-08-30 19:38
+ */
+@Data
+@AllArgsConstructor
+@NoArgsConstructor
+@Builder
+public class StructuredDataSchema implements FastDFSchema {
+
+    /**
+     * Field list
+     */
+    private List<DataField> fieldList;
+}
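Editor's note: the schema model composes as DataField list -> StructuredDataSchema (a FastDFSchema) -> DatasetSchema, which additionally carries the dataset-level type tag and the Arrow schema. A small construction sketch (illustrative values, Arrow schema omitted):

import org.secretflow.dataproxy.common.model.dataset.DatasetSchema;

import java.util.List;

public class SchemaExample {
    public static void main(String[] args) {
        DataField idField = DataField.builder()
                .name("id")
                .type(DataFieldTypeEnum.INTEGER)
                .nullable(false)
                .build();

        StructuredDataSchema schema = StructuredDataSchema.builder()
                .fieldList(List.of(idField))
                .build();

        DatasetSchema datasetSchema = DatasetSchema.builder()
                .type(DatasetSchemaTypeEnum.STRUCTURED_DATA)
                .schema(schema)
                .build();
    }
}

diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/DatasetLocationConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/DatasetLocationConfig.java
new file mode 100644
index 0000000..31372b3
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/DatasetLocationConfig.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.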
+ */ + +package org.secretflow.dataproxy.common.model.datasource; + +import org.secretflow.dataproxy.common.model.datasource.location.LocationConfig; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +/** + * Dataset location config + * + * @author muhong + * @date 2023-08-30 19:18 + */ +@Data +@AllArgsConstructor +@NoArgsConstructor +@Builder +public class DatasetLocationConfig { + + /** + * Datasource id + */ + private String datasourceId; + + /** + * Datasource type + */ + private DatasourceTypeEnum type; + + /** + * Location in its datasource + */ + private LocationConfig locationConfig; +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/Datasource.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/Datasource.java new file mode 100644 index 0000000..b815382 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/Datasource.java @@ -0,0 +1,71 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.datasource; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; + +import java.util.Map; + +/** + * Datasource + * + * @author yumu + * @date 2023/8/30 16:12 + */ +@Data +@AllArgsConstructor +@NoArgsConstructor +@Builder +public class Datasource { + /** + * Datasource unique id + */ + private String datasourceId; + + /** + * Datasource name + */ + private String name; + + /** + * Datasource description + */ + private String description; + + /** + * Datasource connection config + */ + private DatasourceConnConfig connConfig; + + /** + * Writable + */ + private Boolean writable; + + /** + * Owner id + */ + private String ownerId; + + /** + * Attributes + */ + private Map attributes; +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/DatasourceConnConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/DatasourceConnConfig.java new file mode 100644 index 0000000..f71ef14 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/DatasourceConnConfig.java @@ -0,0 +1,52 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.common.model.datasource; + +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Data; +import lombok.NoArgsConstructor; +import org.secretflow.dataproxy.common.model.datasource.conn.ConnConfig; +import org.secretflow.dataproxy.common.utils.IdUtils; +import org.secretflow.dataproxy.common.utils.JsonUtils; + +/** + * Datasource connection config + * + * @author yumu + * @date 2023/8/30 16:30 + */ +@Data +@AllArgsConstructor +@NoArgsConstructor +@Builder +public class DatasourceConnConfig { + + /** + * Datasource type + */ + private DatasourceTypeEnum type; + + /** + * Datasource connection config content + */ + private ConnConfig connConfig; + + public String generateUniqueId() { + return IdUtils.combineIds(JsonUtils.toJSONString(this)); + } +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/DatasourceSchemaConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/DatasourceSchemaConfig.java new file mode 100644 index 0000000..507acdf --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/DatasourceSchemaConfig.java @@ -0,0 +1,28 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.datasource; + +/** + * 数据源结构信息父类 + * + * @author muhong + * @date 2023-08-09 10:18 AM + */ +public class DatasourceSchemaConfig { + + +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/DatasourceTypeEnum.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/DatasourceTypeEnum.java new file mode 100644 index 0000000..c084418 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/DatasourceTypeEnum.java @@ -0,0 +1,55 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.common.model.datasource; + +import lombok.Getter; + +/** + * Datasource type enum + * + * @author yumu + * @date 2023/8/29 14:39 + */ +@Getter +public enum DatasourceTypeEnum { + + /** + * Local filesystem + */ + LOCAL_HOST("file:///"), + + MYSQL, + + OSS("oss://"), + MINIO("s3a://"), + OBS("obs://"), + + /** + * max compute + */ + ODPS, + ; + + private String scheme; + + DatasourceTypeEnum() { + } + + DatasourceTypeEnum(String scheme) { + this.scheme = scheme; + } +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/ConnConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/ConnConfig.java new file mode 100644 index 0000000..ad22c96 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/ConnConfig.java @@ -0,0 +1,26 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.datasource.conn; + +/** + * Datasource connection config content + * + * @author yumu + * @date 2023/8/30 16:24 + */ +public interface ConnConfig { +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/JdbcBaseConnConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/JdbcBaseConnConfig.java new file mode 100644 index 0000000..6f6823d --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/JdbcBaseConnConfig.java @@ -0,0 +1,88 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.common.model.datasource.conn; + +import com.fasterxml.jackson.databind.annotation.JsonSerialize; +import lombok.AllArgsConstructor; +import lombok.Builder; +import lombok.Getter; +import lombok.Setter; +import lombok.NoArgsConstructor; +import lombok.experimental.SuperBuilder; +import org.secretflow.dataproxy.common.serializer.SensitiveDataSerializer; + +import java.util.Map; + +/** + * JDBC datasource connection config + * + * @author muhong + * @date 2023-09-07 14:06 + */ +@Getter +@Setter +@AllArgsConstructor +@NoArgsConstructor +@SuperBuilder +public class JdbcBaseConnConfig implements ConnConfig { + + /** + * Host + */ + private String host; + + /** + * Dataset + */ + private String database; + + /** + * Username + */ + @JsonSerialize(using = SensitiveDataSerializer.class) + private String userName; + + /** + * Password + */ + @JsonSerialize(using = SensitiveDataSerializer.class) + private String password; + + /** + * Options + */ + private Map option; + + @Builder.Default + private Integer maximumPoolSize = 10; + + @Builder.Default + private Integer minimumIdle = 2; + + @Builder.Default + private Boolean cachePrepStmts = true; + + @Builder.Default + private Boolean useServerPrepStmts = true; + + @Builder.Default + private Integer prepStmtCacheSize = 200; + + @Builder.Default + private Integer prepStmtCacheSqlLimit = 2048; + +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/LocalFileSystemConnConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/LocalFileSystemConnConfig.java new file mode 100644 index 0000000..631cd19 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/LocalFileSystemConnConfig.java @@ -0,0 +1,42 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.datasource.conn; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; +import lombok.experimental.SuperBuilder; + +/** + * Local filesystem datasource connection config + * + * @author muhong + * @date 2023-09-13 11:46 + */ +@Getter +@Setter +@AllArgsConstructor +@NoArgsConstructor +@SuperBuilder +public class LocalFileSystemConnConfig implements ConnConfig { + + /** + * Path + */ + private String path; +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/MinioConnConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/MinioConnConfig.java new file mode 100644 index 0000000..3fc62cb --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/MinioConnConfig.java @@ -0,0 +1,36 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.datasource.conn; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.Setter; +import lombok.experimental.SuperBuilder; + +/** + * Minio datasource connection config + * + * @author yumu + * @date 2023/8/30 16:48 + */ +@Getter +@Setter +@SuperBuilder +@AllArgsConstructor +public class MinioConnConfig extends ObjectFileSystemConnConfig { + +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/MysqlConnConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/MysqlConnConfig.java new file mode 100644 index 0000000..6b83719 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/MysqlConnConfig.java @@ -0,0 +1,36 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.datasource.conn; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.Setter; +import lombok.experimental.SuperBuilder; + +/** + * MySQL datasource connection config + * + * @author yumu + * @date 2023/8/30 16:36 + */ +@Getter +@Setter +@SuperBuilder +@AllArgsConstructor +public class MysqlConnConfig extends JdbcBaseConnConfig { + +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/ObjectFileSystemConnConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/ObjectFileSystemConnConfig.java new file mode 100644 index 0000000..5df02a5 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/ObjectFileSystemConnConfig.java @@ -0,0 +1,76 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.secretflow.dataproxy.common.model.datasource.conn;
+
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+import lombok.experimental.SuperBuilder;
+import org.secretflow.dataproxy.common.serializer.SensitiveDataSerializer;
+
+/**
+ * Object storage datasource connection config
+ *
+ * @author muhong
+ * @date 2023-09-11 11:34
+ */
+@Getter
+@Setter
+@AllArgsConstructor
+@NoArgsConstructor
+@SuperBuilder
+public class ObjectFileSystemConnConfig implements ConnConfig {
+
+    /**
+     * Endpoint address
+     */
+    private String endpoint;
+
+    /**
+     * Access key
+     */
+    @JsonSerialize(using = SensitiveDataSerializer.class)
+    private String accessKey;
+
+    /**
+     * Access key secret
+     */
+    @JsonSerialize(using = SensitiveDataSerializer.class)
+    private String accessSecret;
+
+    /**
+     * Endpoint protocol, http or https
+     */
+    private String endpointProtocol;
+
+    /**
+     * Region domain name, without host and protocol
+     */
+    private String regionHost;
+
+    /**
+     * Bucket
+     */
+    private String bucket;
+
+    /**
+     * Object key prefix, in the form "prefix/"
+     */
+    private String objectKeyPrefix;
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/OdpsConnConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/OdpsConnConfig.java
new file mode 100644
index 0000000..c923526
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/conn/OdpsConnConfig.java
@@ -0,0 +1,47 @@
+package org.secretflow.dataproxy.common.model.datasource.conn;
+
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import jakarta.validation.constraints.NotBlank;
+import lombok.*;
+import org.secretflow.dataproxy.common.serializer.SensitiveDataSerializer;
+
+/**
+ * Connection configuration for ODPS (MaxCompute)
+ *
+ * @author yuexie
+ * @date 2024-05-30 10:30:20
+ */
+@Getter
+@Setter
+@Builder
+@AllArgsConstructor
+@NoArgsConstructor
+@ToString
+public class OdpsConnConfig implements ConnConfig {
+
+    /**
+     * access key id
+     */
+    @NotBlank
+    @JsonSerialize(using = SensitiveDataSerializer.class)
+    private String accessKeyId;
+
+    /**
+     * access key secret
+     */
+    @NotBlank
+    @JsonSerialize(using = SensitiveDataSerializer.class)
+    private String accessKeySecret;
+
+    /**
+     * endpoint
+     */
+    @NotBlank
+    private String endpoint;
+
+    /**
+     * project name
+     */
+    private String projectName;
+
+}
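Editor's note: because the credential fields above are annotated with @JsonSerialize(using = SensitiveDataSerializer.class) (the serializer itself is added later in this patch), serializing a connection config never leaks secrets. A sketch, with made-up endpoint and project values:

public class ConnConfigExample {
    public static void main(String[] args) {
        OdpsConnConfig conn = OdpsConnConfig.builder()
                .accessKeyId("my-ak")
                .accessKeySecret("my-sk")
                .endpoint("http://service.odps.example.com/api") // illustrative
                .projectName("demo_project")
                .build();

        // Credential fields are rendered as "***" by SensitiveDataSerializer,
        // so the JSON is safe to log.
        System.out.println(JsonUtils.toJSONString(conn));
    }
}

diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/FileSystemLocationConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/FileSystemLocationConfig.java
new file mode 100644
index 0000000..23c98cd
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/FileSystemLocationConfig.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.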
+ */ + +package org.secretflow.dataproxy.common.model.datasource.location; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; +import lombok.experimental.SuperBuilder; + +/** + * File system dataset location config + * + * @author muhong + * @date 2023-09-11 10:58 + */ +@Getter +@Setter +@SuperBuilder +@AllArgsConstructor +@NoArgsConstructor +public class FileSystemLocationConfig implements LocationConfig { + + /** + * Relative path + */ + private String relativePath; +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/JdbcLocationConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/JdbcLocationConfig.java new file mode 100644 index 0000000..e3a066a --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/JdbcLocationConfig.java @@ -0,0 +1,42 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.datasource.location; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; +import lombok.experimental.SuperBuilder; + +/** + * JDBC dataset location config + * + * @author muhong + * @date 2023-09-07 20:52 + */ +@Getter +@Setter +@SuperBuilder +@AllArgsConstructor +@NoArgsConstructor +public class JdbcLocationConfig implements LocationConfig { + + /** + * table name + */ + private String table; +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/LocationConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/LocationConfig.java new file mode 100644 index 0000000..7b41c3d --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/LocationConfig.java @@ -0,0 +1,27 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.common.model.datasource.location; + +/** + * Dataset location config + * + * @author muhong + * @date 2023-08-30 19:15 + */ +public interface LocationConfig { + +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/MinioLocationConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/MinioLocationConfig.java new file mode 100644 index 0000000..6eee4ab --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/MinioLocationConfig.java @@ -0,0 +1,35 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.model.datasource.location; + +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.Setter; +import lombok.experimental.SuperBuilder; + +/** + * Minio dataset location config + * + * @author muhong + * @date 2023-08-30 19:15 + */ +@Getter +@Setter +@SuperBuilder +@AllArgsConstructor +public class MinioLocationConfig extends FileSystemLocationConfig { +} diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/MysqlLocationConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/MysqlLocationConfig.java new file mode 100644 index 0000000..2bc4794 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/MysqlLocationConfig.java @@ -0,0 +1,37 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.secretflow.dataproxy.common.model.datasource.location;
+
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+import lombok.Setter;
+import lombok.experimental.SuperBuilder;
+
+/**
+ * Mysql dataset location config
+ *
+ * @author muhong
+ * @date 2023-08-30 19:16
+ */
+@Getter
+@Setter
+@SuperBuilder
+@AllArgsConstructor
+public class MysqlLocationConfig extends JdbcLocationConfig {
+
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/OSSLocationConfig.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/OSSLocationConfig.java
new file mode 100644
index 0000000..f05787e
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/OSSLocationConfig.java
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.model.datasource.location;
+
+import lombok.AllArgsConstructor;
+import lombok.Builder;
+import lombok.Data;
+import lombok.NoArgsConstructor;
+
+/**
+ * Oss dataset location config
+ *
+ * @author yumu
+ * @date 2023/9/1 17:32
+ */
+@Data
+@AllArgsConstructor
+@NoArgsConstructor
+@Builder
+public class OSSLocationConfig implements LocationConfig {
+
+    /**
+     * File key
+     */
+    private String fileKey;
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/OdpsTableInfo.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/OdpsTableInfo.java
new file mode 100644
index 0000000..11c85c3
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/model/datasource/location/OdpsTableInfo.java
@@ -0,0 +1,23 @@
+package org.secretflow.dataproxy.common.model.datasource.location;
+
+import org.secretflow.v1alpha1.common.Common;
+import org.secretflow.v1alpha1.kusciaapi.Domaindata;
+
+import java.util.List;
+
+public record OdpsTableInfo(String tableName, String partitionSpec, List<String> fields) implements LocationConfig {
+
+    public static OdpsTableInfo fromKusciaData(Domaindata.DomainData domainData) {
+
+        if (domainData.hasPartition() && !domainData.getPartition().getFieldsList().isEmpty()) {
+            return new OdpsTableInfo(domainData.getRelativeUri(), domainData.getPartition().getFields(0).getName(), transformFields(domainData.getColumnsList()));
+        }
+
+        return new OdpsTableInfo(domainData.getRelativeUri(), "", transformFields(domainData.getColumnsList()));
+    }
+
+    private static List<String> transformFields(List<Common.DataColumn> columnList) {
+        return columnList.stream().map(Common.DataColumn::getName).toList();
+    }
+
+}
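Editor's note: OdpsTableInfo is normally derived from a kuscia DomainData message via fromKusciaData; note that partitionSpec then holds the name of the first partition field, not a full spec. A direct-construction sketch with illustrative values:

import java.util.List;

public class OdpsTableInfoExample {
    public static void main(String[] args) {
        // Built by hand here; in production this comes from
        // OdpsTableInfo.fromKusciaData(domainData).
        OdpsTableInfo info = new OdpsTableInfo("demo_table", "dt", List.of("id", "score"));
        System.out.println(info.tableName() + " partitioned by " + info.partitionSpec());
    }
}

diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/serializer/SensitiveDataSerializer.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/serializer/SensitiveDataSerializer.java
new file mode 100644
index 0000000..8525bf9
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/serializer/SensitiveDataSerializer.java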
@@ -0,0 +1,21 @@
+package org.secretflow.dataproxy.common.serializer;
+
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+
+import java.io.IOException;
+
+/**
+ * JSON serializer that masks sensitive field values
+ *
+ * @author yuexie
+ * @date 2024-07-08
+ */
+public class SensitiveDataSerializer extends JsonSerializer<String> {
+
+    @Override
+    public void serialize(String s, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException {
+        jsonGenerator.writeString("***");
+    }
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/serializer/package-info.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/serializer/package-info.java
new file mode 100644
index 0000000..fdc2e10
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/serializer/package-info.java
@@ -0,0 +1,7 @@
+/**
+ * JSON serialization utilities package
+ * @author yuexie
+ * @date 2024-07-08
+ * @version 1.0.0
+ */
+package org.secretflow.dataproxy.common.serializer;
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/DPStringUtils.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/DPStringUtils.java
new file mode 100644
index 0000000..c5767a1
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/DPStringUtils.java
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.utils;
+
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+import java.util.stream.Collectors;
+
+/**
+ * String utils
+ *
+ * @author muhong
+ * @date 2023-10-19 11:13
+ */
+public class DPStringUtils {
+
+    /**
+     * Strip a wrapping identifier from both the start and the end of a string
+     *
+     * @param origin original string
+     * @param identifier wrapping identifier
+     * @return the unwrapped string
+     */
+    public static String removeDecorateIdentifier(String origin, String identifier) {
+        String removeStart = StringUtils.removeStart(origin, identifier);
+        return StringUtils.removeEnd(removeStart, identifier);
+    }
+
+    /**
+     * Join values with a delimiter, skipping null elements
+     *
+     * @param delimiter delimiter
+     * @param array values to join
+     * @return the joined string, or "" when there is nothing to join
+     */
+    public static String joinWithoutEmpty(String delimiter, String... array) {
+        if (array == null || array.length == 0) {
+            return "";
+        }
+
+        List<String> notEmptyList = Arrays.stream(array).filter(Objects::nonNull).collect(Collectors.toList());
+        if (notEmptyList.isEmpty()) {
+            return "";
+        }
+
+        return StringUtils.join(notEmptyList, delimiter);
+    }
+
+}
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/GrpcUtils.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/GrpcUtils.java
new file mode 100644
index 0000000..4d33b52
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/GrpcUtils.java
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.utils;
+
+import com.google.protobuf.Any;
+import com.google.protobuf.InvalidProtocolBufferException;
+import com.google.protobuf.Message;
+import org.apache.arrow.flight.CallStatus;
+
+/**
+ * Grpc utils
+ *
+ * @author muhong
+ * @date 2023-08-23 16:06
+ */
+public class GrpcUtils {
+
+    /**
+     * Helper to parse {@link com.google.protobuf.Any} objects to the specific protobuf object.
+     *
+     * @param source the raw bytes source value.
+     * @return the materialized protobuf object.
+     */
+    public static Any parseOrThrow(byte[] source) {
+        try {
+            return Any.parseFrom(source);
+        } catch (final InvalidProtocolBufferException e) {
+            throw CallStatus.INVALID_ARGUMENT
+                    .withDescription("Received invalid message from remote.")
+                    .withCause(e)
+                    .toRuntimeException();
+        }
+    }
+
+    /**
+     * Helper to unpack {@link com.google.protobuf.Any} objects to the specific protobuf object.
+     *
+     * @param source the parsed source value.
+     * @param as the class to unpack as.
+     * @param <T> the class to unpack as.
+     * @return the materialized protobuf object.
+     */
+    public static <T extends Message> T unpackOrThrow(Any source, Class<T> as) {
+        try {
+            return source.unpack(as);
+        } catch (final InvalidProtocolBufferException e) {
+            throw CallStatus.INVALID_ARGUMENT
+                    .withDescription("Provided message cannot be unpacked as " + as.getName() + ": " + e)
+                    .withCause(e)
+                    .toRuntimeException();
+        }
+    }
+
+    /**
+     * Helper to parse and unpack {@link com.google.protobuf.Any} objects to the specific protobuf object.
+     *
+     * @param source the raw bytes source value.
+     * @param as the class to unpack as.
+     * @param <T> the class to unpack as.
+     * @return the materialized protobuf object.
+     */
+    public static <T extends Message> T unpackAndParseOrThrow(byte[] source, Class<T> as) {
+        return unpackOrThrow(parseOrThrow(source), as);
+    }
+}
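Editor's note: a round-trip sketch for GrpcUtils: pack a message into Any bytes, then recover the typed message or fail with an INVALID_ARGUMENT Flight exception. It assumes the generated kuscia DomainData builder exposes setRelativeUri (the accessor names are taken from usages elsewhere in this patch):

import com.google.protobuf.Any;
import org.secretflow.v1alpha1.kusciaapi.Domaindata;

public class GrpcUtilsExample {
    public static void main(String[] args) {
        Domaindata.DomainData data = Domaindata.DomainData.newBuilder()
                .setRelativeUri("alice/table.csv") // illustrative
                .build();
        byte[] wire = Any.pack(data).toByteArray();

        // Parse the raw bytes back into the typed message, or throw a
        // FlightRuntimeException with INVALID_ARGUMENT status.
        Domaindata.DomainData decoded =
                GrpcUtils.unpackAndParseOrThrow(wire, Domaindata.DomainData.class);
        System.out.println(decoded.getRelativeUri());
    }
}

diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/IdUtils.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/IdUtils.java
new file mode 100644
index 0000000..6cfeb71
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/IdUtils.java
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.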
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.common.utils;
+
+import okio.ByteString;
+import org.apache.commons.lang3.StringUtils;
+
+import java.text.SimpleDateFormat;
+import java.util.Base64;
+import java.util.Date;
+import java.util.Random;
+import java.util.UUID;
+
+/**
+ * @author chengyuan.mc
+ * @date 2021/9/2 11:57 AM
+ **/
+public class IdUtils {
+    private static final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss");
+    private static final Random random = new Random();
+
+    /**
+     * Alphabet for random strings
+     */
+    private static final String idLetters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
+
+    /**
+     * Random suffix length
+     */
+    private static final int idLen = 8;
+
+    /**
+     * Generate an id of the form prefix + splitter + timestamp + splitter + random suffix
+     *
+     * @param prefix id prefix
+     * @param splitter separator between the parts
+     * @return the generated id
+     */
+    public static String createId(String prefix, String splitter) {
+        String dateText;
+        // SimpleDateFormat is not thread-safe, so format under a lock
+        synchronized (dateFormat) {
+            dateText = dateFormat.format(new Date());
+        }
+        return prefix + splitter + dateText + splitter + createRandString(idLen);
+    }
+
+    /**
+     * Generate a random string of the given length
+     *
+     * @return the random string
+     */
+    public static String createRandString(int len) {
+        char[] idChars = new char[len];
+        for (int i = 0; i < len; i++) {
+            idChars[i] = idLetters.charAt(random.nextInt(idLetters.length()));
+        }
+        return new String(idChars);
+    }
+
+    /**
+     * Generate a random UUID with the dashes removed
+     *
+     * @return the uuid string
+     */
+    public static String randomUUID() {
+        return UUID.randomUUID().toString().replace("-", "");
+    }
+
+    /**
+     * Combine multiple ids into one id (SHA-256 hex digest of the "|"-joined ids)
+     *
+     * @param ids ids to combine
+     * @return the combined id
+     */
+    public static String combineIds(String... ids) {
+        return ByteString
+                .encodeUtf8(StringUtils.join(ids, "|"))
+                .sha256()
+                .hex();
+    }
+
+    /**
+     * Concatenate two ids into a traceId
+     */
+    public static String concatIds(String id1, String id2) {
+        return id1 + "|" + id2;
+    }
+
+    /**
+     * Split a traceId back into its two ids
+     */
+    public static String[] splitIds(String str) {
+        // String.split takes a regex, so "|" must be escaped;
+        // an unescaped "|" would split between every character
+        return str.split("\\|");
+    }
+
+    /**
+     * Compress a uuid into a shorter Base64 form
+     */
+    public static String compressUUID(String uuid) {
+        String hex = uuid.replace("-", "");
+        byte[] bytes = hex2Bytes(hex);
+        return Base64.getEncoder().withoutPadding().encodeToString(bytes);
+    }
+
+    private static byte[] hex2Bytes(String hex) {
+        if (hex == null || hex.isEmpty()) {
+            return new byte[0];
+        }
+        byte[] bytes = hex.getBytes();
+        int n = bytes.length >> 1;
+        byte[] buf = new byte[n];
+        for (int i = 0; i < n; i++) {
+            int index = i << 1;
+            buf[i] = (byte) ((byte2Int(bytes[index]) << 4) | byte2Int(bytes[index + 1]));
+        }
+        return buf;
+    }
+
+    // assumes lowercase hex digits, which is what randomUUID/compressUUID produce
+    private static int byte2Int(byte b) {
+        return (b <= '9') ? b - '0' : b - 'a' + 10;
+    }
+
+}
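Editor's note: a short sketch of the IdUtils entry points (values are illustrative; splitIds relies on the escaped-regex fix above):

public class IdUtilsExample {
    public static void main(String[] args) {
        // e.g. "job-20240601121530-aB3xK9Qz": prefix, timestamp, 8 random chars
        String jobId = IdUtils.createId("job", "-");

        // Stable fingerprint: SHA-256 over the "|"-joined ids, hex encoded
        String combined = IdUtils.combineIds("alice", "bob");

        String trace = IdUtils.concatIds("query-1", "node-2");
        String[] parts = IdUtils.splitIds(trace); // ["query-1", "node-2"]
    }
}

diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/JsonUtils.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/JsonUtils.java
new file mode 100644
index 0000000..624fe8d
--- /dev/null
+++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/JsonUtils.java
@@ -0,0 +1,417 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.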
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.utils; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.TreeNode; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.*; +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; +import com.fasterxml.jackson.datatype.jsr310.deser.LocalDateDeserializer; +import com.fasterxml.jackson.datatype.jsr310.deser.LocalDateTimeDeserializer; +import com.fasterxml.jackson.datatype.jsr310.deser.LocalTimeDeserializer; +import com.fasterxml.jackson.datatype.jsr310.ser.LocalDateSerializer; +import com.fasterxml.jackson.datatype.jsr310.ser.LocalDateTimeSerializer; +import com.fasterxml.jackson.datatype.jsr310.ser.LocalTimeSerializer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.reflect.Type; +import java.text.SimpleDateFormat; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.format.DateTimeFormatter; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.TimeZone; + +/** + * Json utils + * + * @author yansi + * @date 2023/5/10 + */ +public class JsonUtils { + + private final static Logger LOGGER = LoggerFactory.getLogger(JsonUtils.class); + + private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss"); + private static final DateTimeFormatter DATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd"); + private static final DateTimeFormatter TIME_FORMATTER = DateTimeFormatter.ofPattern("HH:mm:ss"); + + private static final ObjectMapper OM = new ObjectMapper(); + + /** + * Set ObjectMapper config + */ + static { + OM.setSerializationInclusion(JsonInclude.Include.NON_NULL); + OM.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, true); + OM.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS, true); + OM.configure(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS, true); + OM.configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, true); + OM.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + OM.configure(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS, false); + OM.configure(SerializationFeature.WRITE_DURATIONS_AS_TIMESTAMPS, false); + OM.configure(MapperFeature.SORT_PROPERTIES_ALPHABETICALLY, true); + OM.configure(SerializationFeature.ORDER_MAP_ENTRIES_BY_KEYS, true); + OM.setDateFormat(new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")); + JavaTimeModule javaTimeModule = new JavaTimeModule(); + javaTimeModule.addSerializer(LocalDateTime.class, new LocalDateTimeSerializer(DATE_TIME_FORMATTER)); + javaTimeModule.addSerializer(LocalDate.class, new LocalDateSerializer(DATE_FORMATTER)); + javaTimeModule.addSerializer(LocalTime.class, new LocalTimeSerializer(TIME_FORMATTER)); + 
javaTimeModule.addDeserializer(LocalDateTime.class, new LocalDateTimeDeserializer(DATE_TIME_FORMATTER)); + javaTimeModule.addDeserializer(LocalDate.class, new LocalDateDeserializer(DATE_FORMATTER)); + javaTimeModule.addDeserializer(LocalTime.class, new LocalTimeDeserializer(TIME_FORMATTER)); + OM.registerModule(javaTimeModule); + OM.setTimeZone(TimeZone.getDefault()); + } + + /** + * Make java type with parametrized and parameterClasses + * + * @param parametrized raw parametrized type + * @param parameterClasses type parameters + * @return JavaType + */ + public static JavaType makeJavaType(Class<?> parametrized, Class<?>... parameterClasses) { + return OM.getTypeFactory().constructParametricType(parametrized, parameterClasses); + } + + /** + * Make java type with rawType and parameterTypes + * + * @param rawType raw type + * @param parameterTypes type parameters + * @return JavaType + */ + public static JavaType makeJavaType(Class<?> rawType, JavaType... parameterTypes) { + return OM.getTypeFactory().constructParametricType(rawType, parameterTypes); + } + + /** + * Convert object to json string + * + * @param value source object + * @return String + */ + public static String toString(Object value) { + if (Objects.isNull(value)) { + return null; + } + if (value instanceof String) { + return (String) value; + } + return toJSONString(value); + } + + /** + * Convert object to json string with writeValueAsString + * + * @param value source object + * @return String + * @throws RuntimeException if serialization fails + */ + public static String toJSONString(Object value) { + try { + return OM.writeValueAsString(value); + } catch (JsonProcessingException e) { + LOGGER.error("object to json failed", e); + throw new RuntimeException(e); + } + } + + /** + * Convert object to json string with writerWithDefaultPrettyPrinter + * + * @param value source object + * @return String + * @throws RuntimeException if serialization fails + */ + public static String toPrettyString(Object value) { + try { + return OM.writerWithDefaultPrettyPrinter().writeValueAsString(value); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + } + + /** + * Convert object to JsonNode + * + * @param value source object + * @return JsonNode + */ + public static JsonNode fromJavaObject(Object value) { + // instanceof is null-safe, so no extra null check is needed + if (value instanceof String) { + return parseObject((String) value); + } + return OM.valueToTree(value); + } + + /** + * Convert string to JsonNode + * + * @param content json string + * @return JsonNode + */ + public static JsonNode parseObject(String content) { + try { + return OM.readTree(content); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + } + + /** + * Get JsonNode from name + * + * @param node parent node + * @param name field name + * @return JsonNode + */ + public static JsonNode getJsonElement(JsonNode node, String name) { + return node.get(name); + } + + /** + * Get JsonNode from index + * + * @param node parent node + * @param index element index + * @return JsonNode + */ + public static JsonNode getJsonElement(JsonNode node, int index) { + return node.get(index); + } + + /** + * Convert TreeNode to java target class + * + * @param node tree node + * @param clazz target class + * @param <T> target type + * @return target class + * @throws RuntimeException if conversion fails + */ + public static <T> T toJavaObject(TreeNode node, Class<T> clazz) { + try { + return OM.treeToValue(node, clazz); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + } + + /** + * Convert TreeNode to java target class with javaType + * + * @param node tree node + * @param javaType target java type + * @param <T> target type + * @return target class + */ + public static <T> T toJavaObject(TreeNode node, JavaType javaType) { + return OM.convertValue(node, javaType); + } + + /** + * Convert TreeNode to java target class with typeReference + * + * @param node tree node + * @param typeReference target type reference + * @param <T> target type + * @return target class + */ + public static <T> T toJavaObject(TreeNode node, TypeReference<T> typeReference) { + return OM.convertValue(node, typeReference); + } + + /** + * Convert TreeNode to java target class with type + * + * @param node tree node + * @param type target type + * @param <T> target type + * @return target class + */ + public static <T> T toJavaObject(TreeNode node, Type type) { + return toJavaObject(node, OM.constructType(type)); + } + + /** + * Convert TreeNode to java target class list + * + * @param node tree node + * @param clazz element class + * @param <T> element type + * @return target class list + */ + public static <T> List<T> toJavaList(TreeNode node, Class<T> clazz) { + return toJavaObject(node, makeJavaType(List.class, clazz)); + } + + /** + * Convert TreeNode to object list + * + * @param node tree node + * @return object list + */ + public static List<Object> toJavaList(TreeNode node) { + return toJavaObject(node, new TypeReference<List<Object>>() { + }); + } + + /** + * Convert TreeNode to java target class map + * + * @param node tree node + * @param clazz value class + * @param <T> value type + * @return target class map + */ + public static <T> Map<String, T> toJavaMap(TreeNode node, Class<T> clazz) { + return toJavaObject(node, makeJavaType(Map.class, String.class, clazz)); + } + + /** + * Convert TreeNode to java target class map + * + * @param node tree node + * @return target class map + */ + public static Map<String, Object> toJavaMap(TreeNode node) { + return toJavaObject(node, new TypeReference<Map<String, Object>>() { + }); + } + + /** + * Convert string to java target class + * + * @param content json string + * @param clazz target class + * @param <T> target type + * @return target class + * @throws RuntimeException if deserialization fails + */ + public static <T> T toJavaObject(String content, Class<T> clazz) { + try { + return OM.readValue(content, clazz); + } catch (JsonProcessingException e) { + LOGGER.error("json to object failed, json is {}", content, e); + throw new RuntimeException(e); + } + } + + /** + * Convert string to java target class with javaType + * + * @param content json string + * @param javaType target java type + * @param <T> target type + * @return target class + * @throws RuntimeException if deserialization fails + */ + public static <T> T toJavaObject(String content, JavaType javaType) { + try { + return OM.readValue(content, javaType); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + } + + /** + * Convert string to java target class with typeReference + * + * @param content json string + * @param typeReference target type reference + * @param <T> target type + * @return target class + * @throws RuntimeException if deserialization fails + */ + public static <T> T toJavaObject(String content, TypeReference<T> typeReference) { + try { + return OM.readValue(content, typeReference); + } catch (JsonProcessingException e) { + throw new RuntimeException(e); + } + } + + /** + * Convert string to java target class with type + * + * @param content json string + * @param type target type + * @param <T> target type + * @return target class + */ + public static <T> T toJavaObject(String content, Type type) { + return toJavaObject(content, OM.constructType(type)); + } + + /** + * Convert string to java target class list + * + * @param content json string + * @param clazz element class + * @param <T> element type + * @return target class list + */ + public static <T> List<T> toJavaList(String content, Class<T> clazz) { + return toJavaObject(content, makeJavaType(List.class, clazz)); + } + + /** + * Convert string to object list + * + * @param content json string + * @return object list + */ + public static List<Object> toJavaList(String content) { + return toJavaObject(content, new TypeReference<List<Object>>() { + }); + } + + /** + * Convert content to java target class map + * + * @param content json string + * @param clazz value class + * @param <T> value type + * @return target class map + */ + public static <T> Map<String, T> toJavaMap(String content, Class<T> clazz) { + return toJavaObject(content, makeJavaType(Map.class, String.class, clazz)); + } + + /** + * Convert content to java target class map + * + * @param content json string + * @return target class map + */ + public static Map<String, Object> toJavaMap(String content) { + return toJavaObject(content, new TypeReference<Map<String, Object>>() { + }); + } +}
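For reference, a minimal usage sketch of the utility above; Person is a hypothetical bean with name/age properties, and single-quoted JSON parses only because ALLOW_SINGLE_QUOTES is enabled in the static initializer:

    Map<String, Person> byId = JsonUtils.toJavaMap("{'p1': {'name': 'alice', 'age': 30}}", Person.class);
    String json = JsonUtils.toJSONString(byId);   // non-null fields only, keys in sorted order
    JsonNode root = JsonUtils.parseObject(json);
    List<Object> nums = JsonUtils.toJavaList("[1, 2, 3]");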
diff --git a/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/ProtoBufJsonUtils.java b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/ProtoBufJsonUtils.java new file mode 100644 index 0000000..9784031 --- /dev/null +++ b/dataproxy-common/src/main/java/org/secretflow/dataproxy/common/utils/ProtoBufJsonUtils.java @@ -0,0 +1,47 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.common.utils; + +import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode; +import org.secretflow.dataproxy.common.exceptions.DataproxyException; + +import com.google.protobuf.Message; +import com.google.protobuf.util.JsonFormat; + +/** + * Pb utils + * + * @author huanyu.wty(焕羽) + * @date 2022/04/25 + */ +public class ProtoBufJsonUtils { + + /** + * Pb message to json string + * + * @param sourceMessage pb message + * @return json string + */ + public static String toJSONString(Message sourceMessage) { + try { + return JsonFormat.printer().print(sourceMessage); + } catch (Exception e) { + throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, e); + } + } + +}
diff --git a/dataproxy-integration-tests/pom.xml b/dataproxy-integration-tests/pom.xml new file mode 100644 index 0000000..b780f66 --- /dev/null +++ b/dataproxy-integration-tests/pom.xml @@ -0,0 +1,21 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.secretflow</groupId> + <artifactId>dataproxy</artifactId> + <version>0.0.1-SNAPSHOT</version> + </parent> + + <artifactId>dataproxy-integration-tests</artifactId> + + <dependencies> + <dependency> + <groupId>org.secretflow</groupId> + <artifactId>dataproxy-common</artifactId> + </dependency> + </dependencies> + +</project> \ No newline at end of file
diff --git a/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/DPFlightClient.java b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/DPFlightClient.java new file mode 100644 index 0000000..12fa2d2 --- /dev/null +++ b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/DPFlightClient.java @@ -0,0 +1,103 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.integration.tests; + +import org.secretflow.dataproxy.common.utils.GrpcUtils; + +import com.google.protobuf.Any; +import com.google.protobuf.Message; +import lombok.extern.slf4j.Slf4j; +import org.apache.arrow.flight.*; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.ipc.ArrowStreamWriter; +import org.secretflow.v1alpha1.kusciaapi.Flightdm; + +import java.io.ByteArrayOutputStream; +import java.nio.channels.Channels; +import java.util.List; + +/** + * Client for the DataProxy (fastDS) FlightServer, used to exercise its data services + * + * @author yumu + * @date 2023/8/16 17:23 + */ +@Slf4j +public class DPFlightClient { + + private final FlightClient flightClient; + + + /** + * Builds a client connected to the given server + * + * @param allocator Arrow buffer allocator + * @param location location of the FlightServer; the URI is what matters + */ + public DPFlightClient(BufferAllocator allocator, Location location) { + this.flightClient = FlightClient.builder() + .allocator(allocator) + .location(location) + .build(); + } + + public FlightInfo getFlightInfo(Message readCmd) { + FlightDescriptor flightDescriptor = FlightDescriptor.command(Any.pack(readCmd).toByteArray()); + return flightClient.getInfo(flightDescriptor); + } + + + /** + * Consumes data from the server and prints every downloaded batch + * + * @param flightInfo uniquely identifies a dataset on the server, including its schema and download endpoints + */ + public void downloadStructDataAndPrint(FlightInfo flightInfo) { + // 1. Fetch the endpoint list and the ticket of each endpoint + List<FlightEndpoint> endpointList = flightInfo.getEndpoints(); + + // 2. Download the data behind every endpoint + for (FlightEndpoint endpoint : endpointList) { + try (FlightStream flightStream = flightClient.getStream(endpoint.getTicket())) { + VectorSchemaRoot vectorSchemaRootReceived = flightStream.getRoot(); + while (flightStream.next()) { + try (ByteArrayOutputStream out = new ByteArrayOutputStream();
ArrowStreamWriter writer = new ArrowStreamWriter(vectorSchemaRootReceived, null, Channels.newChannel(out))) { + writer.start(); + writer.writeBatch(); + System.out.println(vectorSchemaRootReceived.contentToTSVString()); + } + } + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + + public void uploadAllTypeData(FlightInfo flightInfo, VectorSchemaRoot root) { + final Any acceptPutCommand = GrpcUtils.parseOrThrow(flightInfo.getEndpoints().get(0).getTicket().getBytes()); + Flightdm.TicketDomainDataQuery ticketDomainDataQuery = GrpcUtils.unpackOrThrow(acceptPutCommand, Flightdm.TicketDomainDataQuery.class); + + FlightClient.ClientStreamListener listener = flightClient.startPut( + FlightDescriptor.command(Any.pack(ticketDomainDataQuery).toByteArray()), root, new AsyncPutListener()); + for (int i = 1; i < 10; i++) { + listener.putNext(); + } + listener.completed(); + listener.getResult(); + } +} diff --git a/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/DataproxyKusciaTest.java b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/DataproxyKusciaTest.java new file mode 100644 index 0000000..b5d7559 --- /dev/null +++ b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/DataproxyKusciaTest.java @@ -0,0 +1,94 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.integration.tests; + +import org.apache.arrow.flight.FlightInfo; +import org.apache.arrow.flight.Location; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.secretflow.dataproxy.integration.tests.config.LocalHostKusciaConnectorConfig; +import org.secretflow.dataproxy.integration.tests.config.OssKusciaConnectorConfig; +import org.secretflow.v1alpha1.kusciaapi.Domaindata; +import org.secretflow.v1alpha1.kusciaapi.Domaindatasource; +import org.secretflow.v1alpha1.kusciaapi.Flightdm; +import org.secretflow.v1alpha1.kusciaapi.Flightinner; + +import java.util.Arrays; +import java.util.List; + +/** + * @author muhong + * @date 2023-11-17 11:07 + */ +public class DataproxyKusciaTest { + private final static String DP_HOST = ""; + private final static int DP_PORT = 8023; + + private final static BufferAllocator allocator = new RootAllocator(); + + private final static List<KusciaConnectorConfig> configList = Arrays.asList( +// new MysqlKusciaConnectorConfig(), + new OssKusciaConnectorConfig(), + new LocalHostKusciaConnectorConfig() + ); + + public static void main(String[] args) { + for (KusciaConnectorConfig connectorConfig : configList) { + writeTest(connectorConfig.getDatasource(), connectorConfig.getDataset()); + readTest(connectorConfig.getDatasource(), connectorConfig.getDataset()); + } + } + + public static void writeTest(Domaindatasource.DomainDataSource dataSource, Domaindata.DomainData domainData) { + Flightdm.CommandDomainDataUpdate updateCommand = Flightdm.CommandDomainDataUpdate.newBuilder() + .setDomaindataId(domainData.getDomaindataId()) + .setContentType(Flightdm.ContentType.Table) + .build(); + + Flightinner.CommandDataMeshUpdate update = Flightinner.CommandDataMeshUpdate.newBuilder() + .setUpdate(updateCommand) + .setDatasource(dataSource) + .setDomaindata(domainData) + .build(); + + FlightInfo flightInfo = getFlightClient().getFlightInfo(update); + getFlightClientFromFlightInfo(flightInfo).uploadAllTypeData(flightInfo, TestDataUtils.generateKusciaAllTypeData(allocator)); + } + + public static void readTest(Domaindatasource.DomainDataSource dataSource, Domaindata.DomainData domainData) { + Flightinner.CommandDataMeshQuery query = Flightinner.CommandDataMeshQuery.newBuilder() + .setQuery(Flightdm.CommandDomainDataQuery.newBuilder() + .setDomaindataId(domainData.getDomaindataId()) + .setContentType(Flightdm.ContentType.Table) + .build()) + .setDatasource(dataSource) + .setDomaindata(domainData) + .build(); + + FlightInfo flightInfo = getFlightClient().getFlightInfo(query); + getFlightClientFromFlightInfo(flightInfo).downloadStructDataAndPrint(flightInfo); + } + + private static DPFlightClient getFlightClient() { + return new DPFlightClient(allocator, Location.forGrpcInsecure(DP_HOST, DP_PORT)); + } + + private static DPFlightClient getFlightClientFromFlightInfo(FlightInfo flightInfo) { + Location location = flightInfo.getEndpoints().get(0).getLocations().get(0); + return new DPFlightClient(allocator, location); + } +}
diff --git a/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/KusciaConnectorConfig.java b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/KusciaConnectorConfig.java new file mode 100644 index 0000000..71a13d3 --- /dev/null +++ b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/KusciaConnectorConfig.java @@ -0,0 +1,32 @@ +/* + * Copyright 2023 Ant Group Co., Ltd.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.integration.tests; + + +import org.secretflow.v1alpha1.kusciaapi.Domaindata; +import org.secretflow.v1alpha1.kusciaapi.Domaindatasource; + +/** + * @author muhong + * @date 2023-11-17 11:03 + */ +public interface KusciaConnectorConfig { + + Domaindatasource.DomainDataSource getDatasource(); + + Domaindata.DomainData getDataset(); +} diff --git a/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/TestDataUtils.java b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/TestDataUtils.java new file mode 100644 index 0000000..689e75e --- /dev/null +++ b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/TestDataUtils.java @@ -0,0 +1,226 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.integration.tests; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.*; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +/** + * @author muhong + * @date 2023-10-08 15:33 + */ +public class TestDataUtils { + + private final static double nullRate = 0.2; + private final static Random random = new Random(System.currentTimeMillis()); + + /** + * Generate a test batch covering every Kuscia-supported column type + * + * @param allocator Arrow buffer allocator + * @return VectorSchemaRoot holding 10 rows of randomly nulled data + */ + public static VectorSchemaRoot generateKusciaAllTypeData(BufferAllocator allocator) { + List<FieldVector> vectorList = new ArrayList<>(); + int count = 10; + + vectorList.add(createTinyIntVector(allocator, count)); + vectorList.add(createSmallIntVector(allocator, count)); + vectorList.add(createIntVector(allocator, count)); + vectorList.add(createBigIntVector(allocator, count)); + vectorList.add(createFloat4Vector(allocator, count)); + vectorList.add(createFloat8Vector(allocator, count)); + vectorList.add(createBitVector(allocator, count)); + vectorList.add(createVarCharVector(allocator, count)); + vectorList.add(createVarBinaryVector(allocator, count)); + + VectorSchemaRoot result = new VectorSchemaRoot(vectorList); + result.setRowCount(count); + result.syncSchema(); + return result; + } + + public static TinyIntVector createTinyIntVector(BufferAllocator allocator, int count) { + TinyIntVector tinyIntVector = new TinyIntVector("tinyint", allocator); + tinyIntVector.allocateNew(count); + for (int i = 0; i < count; i++) { + if (random.nextInt() % 100 < (nullRate * 100)) { + tinyIntVector.setNull(i); + } else { + tinyIntVector.set(i, i % 128); + } + } + return tinyIntVector; + } + + public static SmallIntVector createSmallIntVector(BufferAllocator allocator, int count) { + SmallIntVector smallIntVector = new SmallIntVector("smallint", allocator); + smallIntVector.allocateNew(count); + for (int i = 0; i < count; i++) { + if (random.nextInt() % 100 < (nullRate * 100)) { + smallIntVector.setNull(i); + } else { + smallIntVector.set(i, i % 32768); + } + } + return smallIntVector; + } + + public static IntVector createIntVector(BufferAllocator allocator, int count) { + IntVector intVector = new IntVector("int", allocator); + intVector.allocateNew(count); + for (int i = 0; i < count; i++) { + if (random.nextInt() % 100 < (nullRate * 100)) { + intVector.setNull(i); + } else { + intVector.set(i, i); + } + } + return intVector; + } + + public static BigIntVector createBigIntVector(BufferAllocator allocator, int count) { + BigIntVector bigIntVector = new BigIntVector("bigInt", allocator); + bigIntVector.allocateNew(count); + for (int i = 0; i < count; i++) { + if (random.nextInt() % 100 < (nullRate * 100)) { + bigIntVector.setNull(i); + } else { + bigIntVector.set(i, i); + } + } + return bigIntVector; + } + + public static UInt1Vector createUInt1Vector(BufferAllocator allocator, int count) { + UInt1Vector uInt1Vector = new UInt1Vector("uint1", allocator); + uInt1Vector.allocateNew(count); + for (int i = 0; i < count; i++) { + if (random.nextInt() % 100 < (nullRate * 100)) { + uInt1Vector.setNull(i); + } else { + uInt1Vector.set(i, i % 256); + } + } + return uInt1Vector; + }
+ + public static UInt2Vector createUInt2Vector(BufferAllocator allocator, int count) { + UInt2Vector uInt2Vector = new UInt2Vector("uint2", allocator); + uInt2Vector.allocateNew(count); + for (int i = 0; i < count; i++) { + if (random.nextInt() % 100 < (nullRate * 100)) { + uInt2Vector.setNull(i); + } else { + uInt2Vector.set(i, i % 65536); + } + } + return uInt2Vector; + } + + public static UInt4Vector createUInt4Vector(BufferAllocator allocator, int count) { + UInt4Vector uInt4Vector = new UInt4Vector("uint4", allocator); + uInt4Vector.allocateNew(count); + for (int i = 0; i < count; i++) { + if (random.nextInt() % 100 < (nullRate * 100)) { + uInt4Vector.setNull(i); + } else { + uInt4Vector.set(i, i); + } + } + return uInt4Vector; + } + + public static UInt8Vector createUInt8Vector(BufferAllocator allocator, int count) { + UInt8Vector uInt8Vector = new UInt8Vector("uint8", allocator); + uInt8Vector.allocateNew(count); + for (int i = 0; i < count; i++) { + if (random.nextInt() % 100 < (nullRate * 100)) { + uInt8Vector.setNull(i); + } else { + uInt8Vector.set(i, i); + } + } + return uInt8Vector; + } + + public static Float4Vector createFloat4Vector(BufferAllocator allocator, int count) { + Float4Vector float4Vector = new Float4Vector("float4", allocator); + float4Vector.allocateNew(count); + for (int i = 0; i < count; i++) { + if (random.nextInt() % 100 < (nullRate * 100)) { + float4Vector.setNull(i); + } else { + float4Vector.set(i, random.nextFloat()); + } + } + return float4Vector; + } + + public static Float8Vector createFloat8Vector(BufferAllocator allocator, int count) { + Float8Vector float8Vector = new Float8Vector("double", allocator); + float8Vector.allocateNew(count); + for (int i = 0; i < count; i++) { + if (random.nextInt() % 100 < (nullRate * 100)) { + float8Vector.setNull(i); + } else { + float8Vector.set(i, random.nextDouble()); + } + } + return float8Vector; + } + + public static BitVector createBitVector(BufferAllocator allocator, int count) { + BitVector bitVector = new BitVector("bool", allocator); + bitVector.allocateNew(count); + for (int i = 0; i < count; i++) { + bitVector.set(i, random.nextBoolean() ? 1 : 0); + } + return bitVector; + } + + public static VarCharVector createVarCharVector(BufferAllocator allocator, int count) { + VarCharVector varCharVector = new VarCharVector("string", allocator); + varCharVector.allocateNew(count); + for (int i = 0; i < count; i++) { + if (random.nextInt() % 100 < (nullRate * 100)) { + varCharVector.setNull(i); + } else { + varCharVector.set(i, ("string_" + i).getBytes(StandardCharsets.UTF_8)); + } + } + return varCharVector; + } + + public static VarBinaryVector createVarBinaryVector(BufferAllocator allocator, int count) { + VarBinaryVector binaryVector = new VarBinaryVector("binary", allocator); + binaryVector.allocateNew(count); + for (int i = 0; i < count; i++) { + if (random.nextInt() % 100 < (nullRate * 100)) { + binaryVector.setNull(i); + } else { + binaryVector.set(i, ("binary_" + i).getBytes(StandardCharsets.UTF_8)); + } + } + return binaryVector; + } +} \ No newline at end of file
diff --git a/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/config/LocalHostKusciaConnectorConfig.java b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/config/LocalHostKusciaConnectorConfig.java new file mode 100644 index 0000000..85a4a89 --- /dev/null +++ b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/config/LocalHostKusciaConnectorConfig.java @@ -0,0 +1,65 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.integration.tests.config; + +import org.secretflow.dataproxy.integration.tests.KusciaConnectorConfig; + +import org.secretflow.v1alpha1.common.Common; +import org.secretflow.v1alpha1.kusciaapi.Domaindata; +import org.secretflow.v1alpha1.kusciaapi.Domaindatasource; + +/** + * @author muhong + * @date 2023-11-17 11:13 + */ +public class LocalHostKusciaConnectorConfig implements KusciaConnectorConfig { + + private final static String TEST_LOCALHOST_ROOT_PATH = "/Users/wubin/work/code/pdcp/pdcpdp/pdcp-pds/test"; + + private final static String TEST_DATASOURCE_ID = "localhost_integration_test_datasource"; + private final static String TEST_DATASET_ID = "localhost_integration_test_dataset"; + private final static String TEST_OWNER = "integration_test_user"; + private final static String TEST_TABLE = "localhost_all_types_write"; + + @Override + public Domaindatasource.DomainDataSource getDatasource() { + return Domaindatasource.DomainDataSource.newBuilder() + .setDatasourceId(TEST_DATASOURCE_ID) + .setName(TEST_DATASOURCE_ID) + .setType("localfs") + .setStatus("Available") + .setInfo(Domaindatasource.DataSourceInfo.newBuilder() + .setLocalfs(Domaindatasource.LocalDataSourceInfo.newBuilder() + .setPath(TEST_LOCALHOST_ROOT_PATH) + .build()) + .build()) + .build(); + } + + @Override + public Domaindata.DomainData getDataset() { + return Domaindata.DomainData.newBuilder() + .setDomaindataId(TEST_DATASET_ID) + .setName(TEST_DATASET_ID) + .setType("table") + .setRelativeUri(TEST_TABLE) + .setDatasourceId(TEST_DATASOURCE_ID) + .setFileFormat(Common.FileFormat.CSV) + .setVendor(TEST_OWNER) + .build(); + } +} diff --git a/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/config/MysqlKusciaConnectorConfig.java b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/config/MysqlKusciaConnectorConfig.java new file mode 100644 index 0000000..7bbb904 --- /dev/null +++ b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/config/MysqlKusciaConnectorConfig.java @@ -0,0 +1,71 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.integration.tests.config; + + +import org.secretflow.dataproxy.integration.tests.KusciaConnectorConfig; + +import org.secretflow.v1alpha1.kusciaapi.Domaindata; +import org.secretflow.v1alpha1.kusciaapi.Domaindatasource; + +/** + * @author muhong + * @date 2023-11-17 11:08 + */ +public class MysqlKusciaConnectorConfig implements KusciaConnectorConfig { + + private final static String MYSQL_HOST = ""; + private final static String MYSQL_PORT = ""; + private final static String DATABASE = ""; + private final static String USERNAME = ""; + private final static String PASSWORD = ""; + + private final static String TEST_DATASOURCE_ID = "mysql_integration_test_datasource"; + private final static String TEST_DATASET_ID = "mysql_integration_test_dataset"; + private final static String TEST_OWNER = "integration_test_user"; + private final static String TEST_TABLE = "mysql_all_types_write"; + + @Override + public Domaindatasource.DomainDataSource getDatasource() { + return Domaindatasource.DomainDataSource.newBuilder() + .setDatasourceId(TEST_DATASOURCE_ID) + .setName(TEST_DATASOURCE_ID) + .setType("mysql") + .setStatus("Available") + .setInfo(Domaindatasource.DataSourceInfo.newBuilder() + .setDatabase(Domaindatasource.DatabaseDataSourceInfo.newBuilder() + .setEndpoint(MYSQL_HOST + ":" + MYSQL_PORT) + .setUser(USERNAME) + .setPassword(PASSWORD) + .setDatabase(DATABASE) + .build()) + .build()) + .build(); + } + + @Override + public Domaindata.DomainData getDataset() { + return Domaindata.DomainData.newBuilder() + .setDomaindataId(TEST_DATASET_ID) + .setName(TEST_DATASET_ID) + .setType("table") + .setRelativeUri(TEST_TABLE) + .setDatasourceId(TEST_DATASOURCE_ID) + .setVendor(TEST_OWNER) + .build(); + } +} diff --git a/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/config/OssKusciaConnectorConfig.java b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/config/OssKusciaConnectorConfig.java new file mode 100644 index 0000000..321553b --- /dev/null +++ b/dataproxy-integration-tests/src/main/java/org/secretflow/dataproxy/integration/tests/config/OssKusciaConnectorConfig.java @@ -0,0 +1,74 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.integration.tests.config; + +import org.secretflow.dataproxy.integration.tests.KusciaConnectorConfig; + +import org.secretflow.v1alpha1.common.Common; +import org.secretflow.v1alpha1.kusciaapi.Domaindata; +import org.secretflow.v1alpha1.kusciaapi.Domaindatasource; + +/** + * @author muhong + * @date 2023-11-17 11:08 + */ +public class OssKusciaConnectorConfig implements KusciaConnectorConfig { + + private final static String OSS_HOST = ""; + private final static String OSS_PORT = ""; + private final static String OSS_BUCKET = ""; + private final static String OSS_ACCESS_KEY_ID = ""; + private final static String OSS_ACCESS_KEY_SECRET = ""; + private final static String OSS_STORAGE_TYPE = "minio"; + + private final static String TEST_DATASOURCE_ID = "oss_integration_test_datasource"; + private final static String TEST_DATASET_ID = "oss_integration_test_dataset"; + private final static String TEST_OWNER = "integration_test_user"; + private final static String TEST_TABLE = "test/oss_all_types_write"; + + @Override + public Domaindatasource.DomainDataSource getDatasource() { + return Domaindatasource.DomainDataSource.newBuilder() + .setDatasourceId(TEST_DATASOURCE_ID) + .setName(TEST_DATASOURCE_ID) + .setType("oss") + .setStatus("Available") + .setInfo(Domaindatasource.DataSourceInfo.newBuilder() + .setOss(Domaindatasource.OssDataSourceInfo.newBuilder() + .setEndpoint(OSS_HOST + ":" + OSS_PORT) + .setBucket(OSS_BUCKET) + .setAccessKeyId(OSS_ACCESS_KEY_ID) + .setAccessKeySecret(OSS_ACCESS_KEY_SECRET) + .setStorageType(OSS_STORAGE_TYPE) + .build()) + .build()) + .build(); + } + + @Override + public Domaindata.DomainData getDataset() { + return Domaindata.DomainData.newBuilder() + .setDomaindataId(TEST_DATASET_ID) + .setName(TEST_DATASET_ID) + .setType("table") + .setRelativeUri(TEST_TABLE) + .setDatasourceId(TEST_DATASOURCE_ID) + .setFileFormat(Common.FileFormat.CSV) + .setVendor(TEST_OWNER) + .build(); + } +}
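For reference, a hedged sketch of plugging another connector into the integration tests; every identifier and the path below are hypothetical:

    // Hypothetical localfs config; adding an instance to configList in
    // DataproxyKusciaTest runs the write/read round trip against it.
    public class TmpDirKusciaConnectorConfig implements KusciaConnectorConfig {
        @Override
        public Domaindatasource.DomainDataSource getDatasource() {
            return Domaindatasource.DomainDataSource.newBuilder()
                    .setDatasourceId("tmp_dir_datasource")
                    .setName("tmp_dir_datasource")
                    .setType("localfs")
                    .setStatus("Available")
                    .setInfo(Domaindatasource.DataSourceInfo.newBuilder()
                            .setLocalfs(Domaindatasource.LocalDataSourceInfo.newBuilder()
                                    .setPath("/tmp/dataproxy-it")
                                    .build())
                            .build())
                    .build();
        }

        @Override
        public Domaindata.DomainData getDataset() {
            return Domaindata.DomainData.newBuilder()
                    .setDomaindataId("tmp_dir_dataset")
                    .setName("tmp_dir_dataset")
                    .setType("table")
                    .setRelativeUri("all_types_write")
                    .setDatasourceId("tmp_dir_datasource")
                    .setFileFormat(Common.FileFormat.CSV)
                    .setVendor("integration_test_user")
                    .build();
        }
    }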
diff --git a/dataproxy-manager/pom.xml b/dataproxy-manager/pom.xml new file mode 100644 index 0000000..72260a8 --- /dev/null +++ b/dataproxy-manager/pom.xml @@ -0,0 +1,60 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.secretflow</groupId> + <artifactId>dataproxy</artifactId> + <version>0.0.1-SNAPSHOT</version> + </parent> + + <artifactId>dataproxy-manager</artifactId> + + <dependencies> + <dependency> + <groupId>org.secretflow</groupId> + <artifactId>dataproxy-common</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-aws</artifactId> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-aliyun</artifactId> + </dependency> + <dependency> + <groupId>com.zaxxer</groupId> + <artifactId>HikariCP</artifactId> + </dependency> + <dependency> + <groupId>com.mysql</groupId> + <artifactId>mysql-connector-j</artifactId> + </dependency> + + <dependency> + <groupId>com.googlecode.juniversalchardet</groupId> + <artifactId>juniversalchardet</artifactId> + <version>1.0.3</version> + </dependency> + <dependency> + <groupId>com.opencsv</groupId> + <artifactId>opencsv</artifactId> + </dependency> + + <dependency> + <groupId>com.aliyun.odps</groupId> + <artifactId>odps-sdk-core</artifactId> + </dependency> + </dependencies> + +</project> \ No newline at end of file
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/Connector.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/Connector.java new file mode 100644 index 0000000..070ed3d --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/Connector.java @@ -0,0 +1,67 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager; + +import org.secretflow.dataproxy.common.model.InferSchemaResult; +import org.secretflow.dataproxy.common.model.command.DatasetReadCommand; +import org.secretflow.dataproxy.common.model.command.DatasetWriteCommand; +import org.secretflow.dataproxy.common.model.dataset.DatasetFormatConfig; +import org.secretflow.dataproxy.common.model.datasource.location.LocationConfig; + +import org.apache.arrow.memory.BufferAllocator; + +/** + * Datasource connector + * + * @author muhong + * @date 2023-09-01 18:04 + */ +public interface Connector extends AutoCloseable { + + /** + * Infer schema + * + * @param allocator Arrow data allocator + * @param locationConfig Dataset location + * @param formatConfig Dataset format + * @return Infer result + */ + InferSchemaResult inferSchema(BufferAllocator allocator, LocationConfig locationConfig, DatasetFormatConfig formatConfig); + + /** + * Build dataset reader + * + * @param allocator Arrow data allocator + * @param readCommand Read command + * @return Reader + */ + DataReader buildReader(BufferAllocator allocator, DatasetReadCommand readCommand); + + /** + * Build dataset writer + * + * @param writeCommand Write command + * @return Writer + */ + DataWriter buildWriter(DatasetWriteCommand writeCommand); + + /** + * Check connector status + * + * @return true if the underlying datasource is reachable + */ + boolean isAvailable(); +} \ No newline at end of file
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/DataReader.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/DataReader.java new file mode 100644 index 0000000..c509449 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/DataReader.java @@ -0,0 +1,36 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager; + +import java.util.List; + +/** + * Dataset reader + * + * @author muhong + * @date 2023-08-21 17:48 + */ +public interface DataReader { + + /** + * Build split dataset reader + * + * @param splitNumber Split number + * @return Split reader list + */ + List<SplitReader> createSplitReader(int splitNumber); +}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/DataWriter.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/DataWriter.java new file mode 100644 index 0000000..bfa3b02 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/DataWriter.java @@ -0,0 +1,47 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager; + +import org.apache.arrow.vector.VectorSchemaRoot; + +import java.io.IOException; + +/** + * Dataset writer + * + * @author muhong + * @date 2023-08-21 17:54 + */ +public interface DataWriter extends AutoCloseable { + + /** + * Write a batch + * + * @param root Batch to write + */ + void write(VectorSchemaRoot root) throws IOException; + + /** + * Write the remaining data in the buffer + */ + void flush() throws IOException; + + /** + * Destroy the data + */ + void destroy() throws IOException; +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/SplitReader.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/SplitReader.java new file mode 100644 index 0000000..e349eda --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/SplitReader.java @@ -0,0 +1,35 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager; + +import org.apache.arrow.vector.ipc.ArrowReader; + +/** + * Split dataset reader + * + * @author muhong + * @date 2023-08-31 17:04 + */ +public interface SplitReader { + + /** + * Get arrow data reader + * + * @return Arrow reader + */ + ArrowReader startRead(); +} \ No newline at end of file diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/BinaryFileDataReader.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/BinaryFileDataReader.java new file mode 100644 index 0000000..81e5415 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/BinaryFileDataReader.java @@ -0,0 +1,54 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.manager.connector.filesystem; + +import org.secretflow.dataproxy.manager.DataReader; +import org.secretflow.dataproxy.manager.SplitReader; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.hadoop.fs.FileSystem; + +import java.util.List; + +/** + * Binary file data reader + * + * @author yumu + * @date 2023/9/12 19:17 + */ +public class BinaryFileDataReader implements DataReader { + + private final BufferAllocator allocator; + + private final FileSystem fileSystem; + + private final String uri; + + public BinaryFileDataReader(BufferAllocator allocator, + FileSystem fileSystem, + String uri) { + this.allocator = allocator; + this.fileSystem = fileSystem; + this.uri = uri; + } + + @Override + public List<SplitReader> createSplitReader(int splitNumber) { + // A binary file is always read as a single split + return List.of(new BinaryFileSplitReader(allocator, fileSystem, uri)); + } +}
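For reference, a sketch of how DataReader, SplitReader and Arrow's ArrowReader compose; dumpAll is a hypothetical helper, not part of the change set:

    // Drains every batch from every split of a reader and prints it as TSV.
    static void dumpAll(DataReader reader) throws Exception {
        for (SplitReader split : reader.createSplitReader(1)) {
            try (ArrowReader arrowReader = split.startRead()) {
                while (arrowReader.loadNextBatch()) {
                    System.out.println(arrowReader.getVectorSchemaRoot().contentToTSVString());
                }
            }
        }
    }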
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/BinaryFileDataWriter.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/BinaryFileDataWriter.java new file mode 100644 index 0000000..45b31da --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/BinaryFileDataWriter.java @@ -0,0 +1,90 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.filesystem; + +import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode; +import org.secretflow.dataproxy.common.exceptions.DataproxyException; +import org.secretflow.dataproxy.manager.DataWriter; + +import lombok.extern.slf4j.Slf4j; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; + +/** + * Binary file data writer + * + * @author muhong + * @date 2023-09-13 22:14 + */ +@Slf4j +public class BinaryFileDataWriter implements DataWriter { + + private static final String FIELD_NAME = "binary_data"; + + private FSDataOutputStream outputStream; + + public BinaryFileDataWriter(FileSystem fileSystem, String uri) { + // Create the file output stream, replacing any existing file + try { + fileSystem.delete(new Path(uri), true); + this.outputStream = fileSystem.create(new Path(uri)); + } catch (Exception e) { + throw DataproxyException.of(DataproxyErrorCode.FILE_WRITE_STREAM_CREATE_FAILED, e.getMessage(), e); + } + } + + @Override + public void write(VectorSchemaRoot root) throws IOException { + log.info("[BinaryFileDataWriter-write] received schema:{}", root.getSchema().toJson()); + VarBinaryVector binaryVector = (VarBinaryVector) root.getVector(FIELD_NAME); + if (binaryVector == null) { + throw DataproxyException.of(DataproxyErrorCode.BINARY_DATA_FIELD_NOT_EXIST); + } + log.info("[BinaryFileDataWriter-write] root row count:{}, vector value count: {}", root.getRowCount(), binaryVector.getValueCount()); + for (int row = 0; row < root.getRowCount(); row++) { + byte[] item = binaryVector.get(row); + if (item == null) { + log.info("[BinaryFileDataWriter-write] row:{}, item is null, continue", row); + continue; + } + // Log at debug level and without the payload, to avoid dumping file contents into the log + log.debug("[BinaryFileDataWriter-write] row:{}, length:{}", row, item.length); + this.outputStream.write(item); + } + } + + @Override + public void flush() throws IOException { + this.outputStream.flush(); + } + + @Override + public void destroy() throws IOException { + + } + + @Override + public void close() throws Exception { + if (this.outputStream != null) { + this.outputStream.close(); + } + } +}
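For reference, a sketch of feeding the writer above; writer is assumed to be an already-constructed DataWriter (for BinaryFileDataWriter the single column must be named binary_data):

    try (BufferAllocator allocator = new RootAllocator();
         VarBinaryVector vector = new VarBinaryVector("binary_data", allocator)) {
        vector.allocateNew(1);
        vector.setSafe(0, "hello".getBytes(StandardCharsets.UTF_8));
        vector.setValueCount(1);
        List<FieldVector> vectors = List.of(vector);
        try (VectorSchemaRoot root = new VectorSchemaRoot(vectors)) {
            root.setRowCount(1);
            writer.write(root);   // appends the raw bytes to the target file
            writer.flush();
        }
    }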
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/BinaryFileSplitReader.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/BinaryFileSplitReader.java new file mode 100644 index 0000000..c6309c8 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/BinaryFileSplitReader.java @@ -0,0 +1,137 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.filesystem; + +import lombok.extern.slf4j.Slf4j; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode; +import org.secretflow.dataproxy.common.exceptions.DataproxyException; +import org.secretflow.dataproxy.manager.SplitReader; + +import java.io.IOException; +import java.util.List; + +/** + * Binary file data split reader + * + * @author muhong + * @date 2023-09-13 21:47 + */ +@Slf4j +public class BinaryFileSplitReader extends ArrowReader implements SplitReader { + + private static final String FIELD_NAME = "binary_data"; + private static final int BATCH_SIZE = 3 * 1024 * 1024; + private final FSDataInputStream inputStream; + + public BinaryFileSplitReader(BufferAllocator allocator, + FileSystem fileSystem, + String uri) { + super(allocator); + + // Generate file input stream + try { + this.inputStream = fileSystem.open(new Path(uri)); + } catch (Exception e) { + throw DataproxyException.of(DataproxyErrorCode.FILE_READ_STREAM_CREATE_FAILED, e); + } + } + + @Override + public ArrowReader startRead() { + return this; + } + + @Override + public boolean loadNextBatch() throws IOException { + VectorSchemaRoot root = getVectorSchemaRoot(); + root.clear(); + VarBinaryVector binaryVector = (VarBinaryVector) root.getVector(FIELD_NAME); + binaryVector.allocateNew(1); + + // Reallocate until the data buffer can hold a full batch + while (binaryVector.getDataBuffer().capacity() < BATCH_SIZE) { + binaryVector.reallocDataBuffer(); + } + + int length = downloadRangeToBuffer(binaryVector.getDataBuffer()); + if (length == 0) { + return false; + } + + binaryVector.getOffsetBuffer().setInt(VarBinaryVector.OFFSET_WIDTH, length); + BitVectorHelper.setBit(binaryVector.getValidityBuffer(), 0); + binaryVector.setLastSet(0); + + root.setRowCount(1); + return true; + } + + @Override + public long bytesRead() { + try { + return this.inputStream.available(); + } catch (Exception e) { + throw DataproxyException.of(DataproxyErrorCode.GET_FILE_SIZE_FAILED, e); + } + } + + @Override + protected void closeReadSource() throws IOException { + try { + if (this.inputStream != null) { + this.inputStream.close(); + } + } catch (Exception ignored) { + } + } + + @Override + protected Schema readSchema() throws IOException { + return new Schema(List.of(Field.notNullable(FIELD_NAME, new ArrowType.Binary()))); + } + + private int downloadRangeToBuffer(ArrowBuf valueBuffer) { + if (inputStream == null) { + return 0; + } + + try { + if (inputStream.available() == 0) { + return 0; + } + + byte[] bytes = new byte[BATCH_SIZE]; + int length = inputStream.read(bytes); + valueBuffer.writeBytes(bytes, 0, length); + return length; + } catch (IOException e) { + throw DataproxyException.of(DataproxyErrorCode.FILE_BATCH_DOWNLOAD_FAILED, e); + } + } +}
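For reference, a sketch of consuming the chunked stream above; each batch is a single row whose binary_data cell holds up to BATCH_SIZE bytes, so concatenating the cells reassembles the file (copyTo is a hypothetical helper):

    static void copyTo(ArrowReader reader, OutputStream out) throws IOException {
        while (reader.loadNextBatch()) {
            VarBinaryVector chunk = (VarBinaryVector) reader.getVectorSchemaRoot().getVector("binary_data");
            out.write(chunk.get(0));   // each batch carries exactly one chunk
        }
    }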
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/CSVDataReader.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/CSVDataReader.java new file mode 100644 index 0000000..f9ac4d5 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/CSVDataReader.java @@ -0,0 +1,66 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.filesystem; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.hadoop.fs.FileSystem; +import org.secretflow.dataproxy.common.model.dataset.format.CSVFormatConfig; +import org.secretflow.dataproxy.manager.DataReader; +import org.secretflow.dataproxy.manager.SplitReader; + +import java.util.List; + +/** + * CSV file data reader + * + * @author muhong + * @date 2023-09-11 12:00 + */ +public class CSVDataReader implements DataReader { + + private final FileSystem fileSystem; + + private final String uri; + + private final BufferAllocator allocator; + + private final CSVFormatConfig formatConfig; + + private final List<String> fieldList; + + private final Schema schema; + + public CSVDataReader(BufferAllocator allocator, + FileSystem fileSystem, + String uri, + Schema schema, + CSVFormatConfig formatConfig, + List<String> fieldList) { + this.allocator = allocator; + this.fileSystem = fileSystem; + this.uri = uri; + this.schema = schema; + this.formatConfig = formatConfig; + this.fieldList = fieldList; + } + + @Override + public List<SplitReader> createSplitReader(int splitNumber) { + return List.of(new CSVSplitReader(allocator, fileSystem, uri, schema, formatConfig, fieldList)); + } +}
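For reference, a hedged construction sketch; allocator, fileSystem and formatConfig are assumed to exist in scope, and the path is hypothetical:

    // Projects only the id and name columns; a null schema makes the split reader
    // below infer an all-Utf8 schema from the CSV header row.
    DataReader reader = new CSVDataReader(allocator, fileSystem, "data/users.csv",
            null, formatConfig, List.of("id", "name"));
    List<SplitReader> splits = reader.createSplitReader(1);   // CSV currently yields one split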
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/CSVDataWriter.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/CSVDataWriter.java new file mode 100644 index 0000000..ffedc0f --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/CSVDataWriter.java @@ -0,0 +1,131 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.filesystem; + +import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode; +import org.secretflow.dataproxy.common.exceptions.DataproxyException; +import org.secretflow.dataproxy.common.model.dataset.format.CSVFormatConfig; +import org.secretflow.dataproxy.manager.DataWriter; + +import com.opencsv.CSVParserBuilder; +import com.opencsv.ICSVParser; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +/** + * CSV file data writer + * + * @author muhong + * @date 2023-09-11 12:01 + */ +public class CSVDataWriter implements DataWriter { + + // Write-side state + private FSDataOutputStream outputStream; + private final ICSVParser rowParser; + private boolean headerWriteFinished = false; + + public CSVDataWriter(FileSystem fileSystem, + String uri, + CSVFormatConfig formatConfig) { + // Configure the CSV row parser from the static format config + this.rowParser = new CSVParserBuilder() + .withSeparator(formatConfig.getSeparator().charAt(0)) + .withQuoteChar(formatConfig.getQuoteChar().charAt(0)) + .withEscapeChar(formatConfig.getEscapeChar().charAt(0)) + .build(); + + // Create the file output stream, replacing any existing file + try { + fileSystem.delete(new Path(uri), true); + this.outputStream = fileSystem.create(new Path(uri)); + } catch (Exception e) { + throw DataproxyException.of(DataproxyErrorCode.FILE_WRITE_STREAM_CREATE_FAILED, e.getLocalizedMessage(), e); + } + } + + @Override + public void write(VectorSchemaRoot root) throws IOException { + + // Write the header once, before the first batch + if (!headerWriteFinished) { + String[] fieldNames = root.getSchema().getFields().stream().map(Field::getName).toArray(String[]::new); + String headerLine = this.rowParser.parseToLine(fieldNames, false); + this.outputStream.write(headerLine.getBytes(StandardCharsets.UTF_8)); + this.headerWriteFinished = true; + } + + // Write the data row by row + for (int row = 0; row < root.getRowCount(); row++) { + String[] values = new String[root.getFieldVectors().size()]; + for (int col = 0; col < root.getFieldVectors().size(); col++) { + values[col] = serialize(root.getVector(col).getObject(row)); + } + + String rowLine = "\n" + this.rowParser.parseToLine(values, false); + this.outputStream.write(rowLine.getBytes(StandardCharsets.UTF_8)); + } + } + + @Override + public void flush() throws IOException { + if (this.outputStream != null) { + this.outputStream.flush(); + } + } + + @Override + public void destroy() throws IOException { + + } + + @Override + public void close() throws Exception { + this.flush(); + + if (this.outputStream != null) { + this.outputStream.close(); + } + } + + /** + * Serialize a single value to its CSV string form + * + * @param value raw value + * @return serialized string + */ + private String serialize(Object value) { + // CSV text cannot distinguish null from an empty string, so null is written as empty content + if (value == null) { + return ""; + } + + // Byte arrays are handled separately (UTF-8, matching the read side) + if (value instanceof byte[]) { + return new String((byte[]) value, StandardCharsets.UTF_8); + } + + // All other types fall back to toString() + return value.toString(); + } +}
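One round-trip caveat worth noting: serialize writes null as an empty cell, and the reader below (addValueInVector) maps empty cells back to null, so null and empty string collapse across a CSV write/read cycle:

    varCharVector.setNull(0);                                        // written as an empty cell
    varCharVector.setSafe(1, "".getBytes(StandardCharsets.UTF_8));   // also written as an empty cell
    // After CSVDataWriter.write(...) and a CSVSplitReader read-back,
    // both rows come back as null.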
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/CSVSplitReader.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/CSVSplitReader.java new file mode 100644 index 0000000..1277943 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/CSVSplitReader.java @@ -0,0 +1,464 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.filesystem; + +import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode; +import org.secretflow.dataproxy.common.exceptions.DataproxyException; +import org.secretflow.dataproxy.common.model.dataset.format.CSVFormatConfig; +import org.secretflow.dataproxy.manager.SplitReader; + +import com.opencsv.CSVParserBuilder; +import com.opencsv.ICSVParser; +import lombok.extern.slf4j.Slf4j; +import okio.Buffer; +import okio.ByteString; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.*; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.collections4.MapUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.mozilla.universalchardet.Constants; +import org.mozilla.universalchardet.UniversalDetector; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import java.time.ZoneOffset; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * CSV file data split reader + * + * @author muhong + * @date 2023-09-11 12:02 + */ +@Slf4j +public class CSVSplitReader extends ArrowReader implements SplitReader { + + private static final int FILE_READ_BATCH_SIZE = 3 * 1024 * 1024; + + private static final int ARROW_DATA_ROW_SIZE = 10000; + + + private final FSDataInputStream inputStream; + private Charset charset = null; + private final Buffer buffer; + private final ICSVParser rowParser; + + private boolean finished = false; + + private final Schema schema; + + /** + * Sequential mapping of original data fields to output fields + */ + private final List<Integer> rawIndexList; + + /** + * Original data header list + */ + private List<String> headerList; + + public CSVSplitReader(BufferAllocator allocator, + FileSystem fileSystem, + String uri, + Schema schema, + CSVFormatConfig formatConfig, + List<String> fieldList) { + super(allocator); + this.buffer = new Buffer(); + // Build CSV parser + this.rowParser = new CSVParserBuilder() + .withSeparator(formatConfig.getSeparator().charAt(0)) + .withQuoteChar(formatConfig.getQuoteChar().charAt(0)) + .withEscapeChar(formatConfig.getEscapeChar().charAt(0)) + .build(); + + // Generate file input stream + try { + this.inputStream = fileSystem.open(new Path(uri)); + } catch (Exception e) { + throw DataproxyException.of(DataproxyErrorCode.FILE_READ_STREAM_CREATE_FAILED, e.getMessage(), e); + } + + // Parse header + parseHeader(); + + // Infer schema + Map<String, String> rawToArrowFieldNameMap = MapUtils.isNotEmpty(formatConfig.getFieldMap()) ? formatConfig.getFieldMap() : new HashMap<>(); + if (schema == null) { + schema = new Schema(this.headerList.stream() + .map(rawName -> Field.nullable(rawToArrowFieldNameMap.getOrDefault(rawName, rawName), new ArrowType.Utf8())) + .collect(Collectors.toList()) + ); + } + + // Read by specific order + if (CollectionUtils.isNotEmpty(fieldList)) { + this.schema = new Schema(fieldList.stream().map(schema::findField).collect(Collectors.toList())); + } else { + this.schema = schema; + } + + // Generate sequential mapping of original data fields to output fields + Map<String, String> arrowToRawFieldNameMap = rawToArrowFieldNameMap.entrySet().stream().collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey)); + this.rawIndexList = this.schema.getFields().stream().map(field -> { + String rawFieldName = arrowToRawFieldNameMap.getOrDefault(field.getName(), field.getName()); + return this.headerList.indexOf(rawFieldName); + }).collect(Collectors.toList()); + } + + /** + * Pre allocate memory + * + * @param root Target root + * @param targetSize Target size + */ + private static void preAllocate(VectorSchemaRoot root, int targetSize) { + for (ValueVector vector : root.getFieldVectors()) { + // Only for fixed-length type data + if (vector instanceof BaseFixedWidthVector) { + ((BaseFixedWidthVector) vector).allocateNew(targetSize); + } + } + } + + /** + * Deserialize data and write it into vector + * + * @param vector Vector + * @param index Data col index + * @param value Serialized data + */ + private static void addValueInVector(FieldVector vector, int index, String value) { + // Empty cells (and nulls) are stored as null; CSV cannot distinguish the two + if (StringUtils.isEmpty(value)) { + vector.setNull(index); + return; + } + + try { + switch (vector.getMinorType()) { + case TINYINT: + ((TinyIntVector) vector).setSafe(index, Integer.parseInt(value)); + break; + case SMALLINT: + ((SmallIntVector) vector).setSafe(index, Integer.parseInt(value)); + break; + case INT: + ((IntVector) vector).setSafe(index, Integer.parseInt(value)); + break; + case BIGINT: + ((BigIntVector) vector).setSafe(index, Long.parseLong(value)); + break; + case UINT1: + ((UInt1Vector) vector).setSafe(index, Integer.parseInt(value)); + break; + case UINT2: + ((UInt2Vector) vector).setSafe(index, Integer.parseInt(value)); + break; + case UINT4: + ((UInt4Vector) vector).setSafe(index, Integer.parseInt(value)); + break; + case UINT8: + ((UInt8Vector) vector).setSafe(index, Long.parseLong(value)); + break; + case FLOAT4: + ((Float4Vector) vector).setSafe(index, Float.parseFloat(value)); + break; + case FLOAT8: + ((Float8Vector) vector).setSafe(index, Double.parseDouble(value)); + break; + case BIT: + // Compatible with true/false, 0/1 + if ("true".equalsIgnoreCase(value)) { + ((BitVector) vector).setSafe(index, 1); + } else if ("false".equalsIgnoreCase(value)) { + ((BitVector) vector).setSafe(index, 0); + } else { + ((BitVector) vector).setSafe(index, Integer.parseInt(value)); + } + break; + case DATEDAY: + ((DateDayVector) vector).setSafe(index, Integer.parseInt(value)); + break; + case DATEMILLI: + // DATEMILLI expects epoch milliseconds (UTC assumed here); the + // previous code used second-of-minute * 1000, which was wrong + ((DateMilliVector) vector).setSafe(index, LocalDateTime.parse(value).toInstant(ZoneOffset.UTC).toEpochMilli()); + break; + case VARCHAR: + ((VarCharVector) vector).setSafe(index, value.getBytes(StandardCharsets.UTF_8)); + break; + case VARBINARY: + ((VarBinaryVector) vector).setSafe(index, value.getBytes(StandardCharsets.UTF_8)); + break; + // Unsupported minor types are silently skipped + } + } catch (NumberFormatException e) { + throw DataproxyException.of(DataproxyErrorCode.DATA_FORMAT_CONVERT_FAILED, + String.format("%s field data \"%s\" cannot be cast to %s", vector.getName(), value, vector.getMinorType()), e); + } + }
vector.getName(), value, vector.getMinorType()), e); + } + } + + @Override + public ArrowReader startRead() { + return this; + } + + @Override + public boolean loadNextBatch() throws IOException { + if (finished) { + return false; + } + + VectorSchemaRoot root = getVectorSchemaRoot(); + root.clear(); + preAllocate(root, ARROW_DATA_ROW_SIZE); + + int count = 0; + while (count < ARROW_DATA_ROW_SIZE) { + String dataLine = readNextLine(); + if (StringUtils.isEmpty(dataLine)) { + this.finished = true; + break; + } + + String[] serializedDataRow = parseLine(dataLine); + for (int col = 0; col < serializedDataRow.length; col++) { + FieldVector fieldVector = root.getVector(col); + addValueInVector(fieldVector, count, serializedDataRow[this.rawIndexList.get(col)]); + } + count++; + root.setRowCount(count); + } + return true; + } + + @Override + public long bytesRead() { + return 0; + } + + @Override + protected void closeReadSource() throws IOException { + try { + if (this.inputStream != null) { + this.inputStream.close(); + } + } catch (Exception ignored) { + } + } + + @Override + protected Schema readSchema() throws IOException { + return this.schema; + } + + /** + * Parse header + */ + private void parseHeader() { + // Parse header line + String headerLine = readNextLine(); + if (StringUtils.isBlank(headerLine)) { + throw DataproxyException.of(DataproxyErrorCode.HEADER_LINE_NOT_EXIST); + } + + String[] headerList = parseLine(headerLine); + if (headerList == null) { + throw DataproxyException.of(DataproxyErrorCode.HEADER_LINE_PARSE_FAILED); + } + this.headerList = Arrays.asList(headerList); + } + + /** + * Parse data line + * + * @param line Original data + * @return Split data list + */ + private String[] parseLine(String line) { + try { + return rowParser.parseLine(line); + } catch (IOException e) { + throw DataproxyException.of(DataproxyErrorCode.VALUE_LINE_PARSE_FAILED, e); + } + } + + /** + * Read next line + * + * @return Next line + */ + private String readNextLine() { + // First determine whether there is a line in the buffer + // If so, return directly, if not, try to download. + // If new data is downloaded, it is judged again whether there is a line. + // If there is no new data to download, the remaining data in the buffer is returned. 
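+        // e.g. with the buffer holding "a,b\nc,": the first call returns "a,b";
+        // the next call downloads another batch before completing "c,...", and at
+        // end of stream the remaining "c," is returned as the final partial line.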
+        boolean continueDownload;
+        do {
+            continueDownload = !isLineInBuffer() && downloadRangeToBuffer();
+        } while (continueDownload);
+
+        try {
+            // Try to detect csv encoding during initialization
+            boolean isInit = false;
+            if (charset == null) {
+                isInit = true;
+                detectEncoding(buffer);
+            }
+            // Check if there is a BOM header, if so remove it
+            if (isInit) {
+                removeBom(buffer);
+            }
+            // Read data according to the recognized charset
+            return readLineOfCharset(buffer);
+        } catch (IOException e) {
+            throw DataproxyException.of(DataproxyErrorCode.READ_DATA_LINE_FAILED, e);
+        }
+    }
+
+    // Detect and remove BOM header of CSV
+    private void removeBom(Buffer buffer) {
+        try {
+            if (buffer.size() != 0) {
+                ByteString firstLine = buffer.copy().readByteString();
+                switch (firstLine.getByte(0) & 0xFF) {
+                    case 0xEF:
+                        if (firstLine.size() > 2 &&
+                            (firstLine.getByte(1) & 0xFF) == 0xBB
+                            && (firstLine.getByte(2) & 0xFF) == 0xBF) {
+                            buffer.skip(3);
+                        }
+                        break;
+                    case 0xFE:
+                        if (firstLine.size() > 3 &&
+                            (firstLine.getByte(1) & 0xFF) == 0xFF
+                            && (firstLine.getByte(2) & 0xFF) == 0x00
+                            && ((firstLine.getByte(3) & 0xFF) == 0x00)) {
+                            buffer.skip(4);
+                        } else if (firstLine.size() > 1
+                            && ((firstLine.getByte(1) & 0xFF) == 0xFF)) {
+                            buffer.skip(2);
+                        }
+                        break;
+                    case 0x00:
+                        if (firstLine.size() > 3) {
+                            if ((firstLine.getByte(1) & 0xFF) == 0x00) {
+                                if ((firstLine.getByte(2) & 0xFF) == 0xFE
+                                    && (firstLine.getByte(3) & 0xFF) == 0xFF) {
+                                    buffer.skip(4);
+                                } else if ((firstLine.getByte(2) & 0xFF) == 0xFF
+                                    && (firstLine.getByte(3) & 0xFF) == 0xFE) {
+                                    buffer.skip(4);
+                                }
+                            }
+                        }
+                        break;
+                    case 0xFF:
+                        if (firstLine.size() > 3 &&
+                            (firstLine.getByte(1) & 0xFF) == 0xFE
+                            && (firstLine.getByte(2) & 0xFF) == 0x00
+                            && ((firstLine.getByte(3) & 0xFF) == 0x00)) {
+                            buffer.skip(4);
+                        } else if (firstLine.size() > 1
+                            && ((firstLine.getByte(1) & 0xFF) == 0xFE)) {
+                            buffer.skip(2);
+                        }
+                        break;
+                }
+            }
+        } catch (Exception e) {
+            throw DataproxyException.of(DataproxyErrorCode.BOM_REMOVE_FAILED, e);
+        }
+    }
+
+    // Detect charset
+    private void detectEncoding(Buffer buffer) {
+        try {
+            UniversalDetector detector = new UniversalDetector(null);
+            ByteString firstLine = buffer.copy().readByteString();
+            detector.handleData(firstLine.toByteArray(), 0, firstLine.size());
+            detector.dataEnd();
+            if (detector.getDetectedCharset() != null) {
+                if (!Charset.forName(detector.getDetectedCharset()).equals(StandardCharsets.UTF_8)) {
+                    // By convention, any non-UTF-8 encoding is treated as GB18030
+                    charset = Charset.forName(Constants.CHARSET_GB18030);
+                } else {
+                    charset = StandardCharsets.UTF_8;
+                }
+            } else {
+                // Fall back to UTF-8 when detection fails
+                charset = StandardCharsets.UTF_8;
+            }
+        } catch (Exception e) {
+            throw DataproxyException.of(DataproxyErrorCode.DETECT_ENCODING_FAILED, e);
+        }
+    }
+
+    private String readLineOfCharset(Buffer buffer) throws IOException {
+        long locOfN = buffer.indexOf(ByteString.of((byte) '\n'));
+        if (locOfN != -1L) {
+            if (locOfN > 0 && buffer.getByte(locOfN - 1) == (byte) '\r') {
+                // \r\n
+                String result = buffer.readString(locOfN - 1, charset);
+                buffer.skip(2);
+                return result;
+            } else {
+                // \n
+                String result = buffer.readString(locOfN, charset);
+                buffer.skip(1);
+                return result;
+            }
+        } else if (buffer.size() != 0L) {
+            return buffer.readString(charset);
+        } else {
+            return null;
+        }
+    }
+
+    private boolean downloadRangeToBuffer() {
+        if (inputStream == null) {
+            return false;
+        }
+
+        try {
+            if (inputStream.available() == 0) {
+                return false;
+            }
+
+            byte[] bytes = new byte[
FILE_READ_BATCH_SIZE]; + int length = inputStream.read(bytes); + buffer.write(bytes, 0, length); + return true; + } catch (IOException e) { + throw DataproxyException.of(DataproxyErrorCode.FILE_BATCH_DOWNLOAD_FAILED, e); + } + } + + private boolean isLineInBuffer() { + return buffer.indexOf((byte) '\n') != -1L; + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/FileSystemConnector.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/FileSystemConnector.java new file mode 100644 index 0000000..65228d7 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/filesystem/FileSystemConnector.java @@ -0,0 +1,241 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.filesystem; + +import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode; +import org.secretflow.dataproxy.common.exceptions.DataproxyException; +import org.secretflow.dataproxy.common.model.FlightContentFormatConfig; +import org.secretflow.dataproxy.common.model.FlightContentFormatTypeEnum; +import org.secretflow.dataproxy.common.model.InferSchemaResult; +import org.secretflow.dataproxy.common.model.command.DatasetReadCommand; +import org.secretflow.dataproxy.common.model.command.DatasetWriteCommand; +import org.secretflow.dataproxy.common.model.dataset.DatasetFormatConfig; +import org.secretflow.dataproxy.common.model.dataset.format.CSVFormatConfig; +import org.secretflow.dataproxy.common.model.datasource.DatasourceTypeEnum; +import org.secretflow.dataproxy.common.model.datasource.conn.ConnConfig; +import org.secretflow.dataproxy.common.model.datasource.conn.LocalFileSystemConnConfig; +import org.secretflow.dataproxy.common.model.datasource.conn.ObjectFileSystemConnConfig; +import org.secretflow.dataproxy.common.model.datasource.location.FileSystemLocationConfig; +import org.secretflow.dataproxy.common.model.datasource.location.LocationConfig; +import org.secretflow.dataproxy.common.utils.DPStringUtils; +import org.secretflow.dataproxy.common.utils.JsonUtils; +import org.secretflow.dataproxy.manager.Connector; +import org.secretflow.dataproxy.manager.DataReader; +import org.secretflow.dataproxy.manager.DataWriter; +import org.secretflow.dataproxy.manager.SplitReader; + +import lombok.extern.slf4j.Slf4j; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.net.URI; +import java.util.Arrays; +import java.util.List; + +/** + * File system connector + * + * @author muhong + * @date 2023-09-11 09:49 + */ +@Slf4j +public class FileSystemConnector implements Connector { + + /** + * Root uri + */ + private final String rootUri; + + /** + * Filesystem + */ + private final FileSystem fileSystem; + + /** + * 
File system connector constructor
+     *
+     * @param type       file system datasource type
+     * @param connConfig file system datasource connection info; its concrete type depends on the datasource type
+     */
+    public FileSystemConnector(DatasourceTypeEnum type, ConnConfig connConfig) {
+        // Build the file system parameters
+        Configuration configuration = new Configuration();
+        switch (type) {
+            case MINIO: {
+                ObjectFileSystemConnConfig minioConnConfig = (ObjectFileSystemConnConfig) connConfig;
+                rootUri = generateUri(type.getScheme(), minioConnConfig.getBucket(), minioConnConfig.getObjectKeyPrefix());
+                configuration.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
+                configuration.set("fs.s3a.endpoint", minioConnConfig.getEndpoint());
+                configuration.set("fs.s3a.access.key", minioConnConfig.getAccessKey());
+                configuration.set("fs.s3a.secret.key", minioConnConfig.getAccessSecret());
+                configuration.set("fs.s3a.buffer.dir", "./dp/buffer");
+                configuration.set("fs.s3a.connection.ssl.enabled", "false");
+                configuration.setInt("fs.s3a.connection.timeout", 7200000);
+                // Reduce retry attempts to avoid long blocking
+                configuration.setInt("fs.s3a.attempts.maximum", 1);
+                configuration.setInt("fs.s3a.retry.limit", 1);
+                break;
+            }
+            case OSS: {
+                ObjectFileSystemConnConfig ossConnConfig = (ObjectFileSystemConnConfig) connConfig;
+                rootUri = generateUri(type.getScheme(), ossConnConfig.getBucket(), ossConnConfig.getObjectKeyPrefix());
+                configuration.set("fs.oss.impl", "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem");
+                configuration.set("fs.oss.endpoint", ossConnConfig.getEndpoint());
+                configuration.set("fs.oss.accessKeyId", ossConnConfig.getAccessKey());
+                configuration.set("fs.oss.accessKeySecret", ossConnConfig.getAccessSecret());
+                configuration.set("fs.oss.buffer.dir", "./dp/buffer");
+                configuration.set("fs.oss.timeout.millisecond", String.valueOf(7200000));
+                configuration.setInt("fs.oss.attempts.maximum", 1);
+                break;
+            }
+            case OBS: {
+                ObjectFileSystemConnConfig obsConnConfig = (ObjectFileSystemConnConfig) connConfig;
+                rootUri = generateUri(type.getScheme(), obsConnConfig.getBucket(), obsConnConfig.getObjectKeyPrefix());
+                configuration.set("fs.obs.impl", "org.apache.hadoop.fs.obs.OBSFileSystem");
+                configuration.set("fs.obs.endpoint", obsConnConfig.getEndpoint());
+                configuration.set("fs.obs.accessKeyId", obsConnConfig.getAccessKey());
+                configuration.set("fs.obs.accessKeySecret", obsConnConfig.getAccessSecret());
+                configuration.set("fs.obs.buffer.dir", "./dp/buffer");
+                configuration.set("fs.obs.timeout.millisecond", String.valueOf(7200000));
+                configuration.setInt("fs.obs.attempts.maximum", 1);
+                break;
+            }
+            case LOCAL_HOST: {
+                LocalFileSystemConnConfig localFsConnConfig = (LocalFileSystemConnConfig) connConfig;
+                rootUri = generateUri(type.getScheme(), localFsConnConfig.getPath());
+                break;
+            }
+            default:
+                throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "Unsupported file system datasource " + type);
+        }
+
+        // Create the file system client
+        try {
+            this.fileSystem = FileSystem.newInstance(new URI(rootUri), configuration);
+        } catch (Exception e) {
+            log.error("[FileSystemConnector] failed to create file system connector, type:{}, config:{}", type, JsonUtils.toJSONString(connConfig), e);
+            throw DataproxyException.of(DataproxyErrorCode.CREATE_DATASOURCE_CONNECTOR_ERROR, e);
+        }
+    }
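+    /*
+     * Minimal usage sketch (hypothetical values; the exact way a ConnConfig is
+     * built is assumed here, not taken from this patch):
+     *
+     *   ConnConfig conn = ...; // endpoint, bucket, access key/secret for MinIO
+     *   Connector connector = new FileSystemConnector(DatasourceTypeEnum.MINIO, conn);
+     *   // connector.isAvailable() probes the root path; close() releases the FileSystem
+     */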
+
+    @Override
+    public InferSchemaResult inferSchema(BufferAllocator allocator, LocationConfig locationConfig, DatasetFormatConfig formatConfig) {
+        String uri = generateFileUri(((FileSystemLocationConfig) locationConfig).getRelativePath());
+
+        DataReader dataReader = null;
+        switch (formatConfig.getType()) {
+            case CSV:
+                dataReader = new CSVDataReader(allocator, this.fileSystem, uri, null, (CSVFormatConfig) formatConfig.getFormatConfig(), null);
+                break;
+            case BINARY_FILE:
+                dataReader = new BinaryFileDataReader(allocator, this.fileSystem, uri);
+                break;
+            default:
+                throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "Unsupported file format " + formatConfig.getType());
+        }
+        List<SplitReader> splitReader = dataReader.createSplitReader(1);
+        try (ArrowReader arrowReader = splitReader.get(0).startRead()) {
+            return InferSchemaResult.builder()
+                .schema(arrowReader.getVectorSchemaRoot().getSchema())
+                .datasetFormatConfig(DatasetFormatConfig.builder()
+                    .type(formatConfig.getType())
+                    .formatConfig(formatConfig.getFormatConfig())
+                    .build())
+                .build();
+        } catch (Exception e) {
+            throw DataproxyException.of(DataproxyErrorCode.READER_RELEASE_FAILED, e);
+        }
+    }
+
+    @Override
+    public DataReader buildReader(BufferAllocator allocator, DatasetReadCommand readCommand) {
+        FileSystemLocationConfig fileSystemLocationConfig = (FileSystemLocationConfig) readCommand.getLocationConfig().getLocationConfig();
+        String uri = generateFileUri(fileSystemLocationConfig.getRelativePath());
+
+        FlightContentFormatConfig outputFormatConfig = readCommand.getOutputFormatConfig();
+        switch (readCommand.getFormatConfig().getType()) {
+            case CSV:
+                // When stored as CSV and the requested output is structured data (or unspecified),
+                // read as structured data; any other output format falls through and is read as binary
+                if (outputFormatConfig == null || outputFormatConfig.getFormatType() == FlightContentFormatTypeEnum.STRUCTURED_DATA) {
+                    log.info("[FileSystemConnector - buildReader] structured data read, uri:{}", uri);
+                    return new CSVDataReader(allocator, this.fileSystem, uri, readCommand.getSchema(), (CSVFormatConfig) readCommand.getFormatConfig().getFormatConfig(), readCommand.getFieldList());
+                }
+                // fall through
+            case BINARY_FILE:
+                log.info("[FileSystemConnector - buildReader] binary file read, uri:{}", uri);
+                return new BinaryFileDataReader(allocator, this.fileSystem, uri);
+            default:
+                throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "Unsupported file format " + readCommand.getFormatConfig().getType());
+        }
+    }
+
+    @Override
+    public DataWriter buildWriter(DatasetWriteCommand writeCommand) {
+        FileSystemLocationConfig fileSystemLocationConfig = (FileSystemLocationConfig) writeCommand.getLocationConfig().getLocationConfig();
+        String uri = generateFileUri(fileSystemLocationConfig.getRelativePath());
+
+        FlightContentFormatConfig inputFormatConfig = writeCommand.getInputFormatConfig();
+        switch (writeCommand.getFormatConfig().getType()) {
+            case CSV:
+                if (inputFormatConfig == null || inputFormatConfig.getFormatType() == FlightContentFormatTypeEnum.STRUCTURED_DATA) {
+                    log.info("[FileSystemConnector - buildWriter] STRUCTURED_DATA, uri:{}", uri);
+                    return new CSVDataWriter(this.fileSystem, uri, (CSVFormatConfig) writeCommand.getFormatConfig().getFormatConfig());
+                }
+                // fall through
+            case BINARY_FILE:
+                log.info("[FileSystemConnector - buildWriter] BINARY_FILE, uri:{}", uri);
+                return new BinaryFileDataWriter(this.fileSystem, uri);
+            default:
+                throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "Unsupported file format " + writeCommand.getFormatConfig().getType());
+        }
+    }
+
+    @Override
+    public boolean isAvailable() {
+        try {
+            this.fileSystem.getFileStatus(new Path(rootUri));
+            return true;
+        } catch (Exception e) {
+            log.info("[FileSystemConnector] check status error, uri:{}", this.rootUri, e);
+            return false;
+        }
+    }
+
+    @Override
+    public void close() throws Exception {
+        this.fileSystem.close();
+    }
+
+    /**
+     * Generate a file URI
+     *
+     * @param scheme protocol scheme
+     * @param path   path segments to join
+     * @return assembled URI
+     */
+    private String generateUri(String scheme, String...
path) { + return scheme + + DPStringUtils.joinWithoutEmpty("/", + Arrays.stream(path).map(item -> DPStringUtils.removeDecorateIdentifier(item, "/")).toArray(String[]::new) + ); + } + + private String generateFileUri(String relativePath){ + return DPStringUtils.removeDecorateIdentifier(this.rootUri, "/") + "/" + DPStringUtils.removeDecorateIdentifier(relativePath, "/"); + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsConnector.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsConnector.java new file mode 100644 index 0000000..6e12feb --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsConnector.java @@ -0,0 +1,121 @@ +/* + * Copyright 2024 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.secretflow.dataproxy.manager.connector.odps; + +import com.aliyun.odps.tunnel.TunnelException; +import org.apache.arrow.memory.BufferAllocator; +import org.secretflow.dataproxy.common.model.InferSchemaResult; +import org.secretflow.dataproxy.common.model.command.DatasetReadCommand; +import org.secretflow.dataproxy.common.model.command.DatasetWriteCommand; +import org.secretflow.dataproxy.common.model.dataset.DatasetFormatConfig; +import org.secretflow.dataproxy.common.model.dataset.format.DatasetFormatTypeEnum; +import org.secretflow.dataproxy.common.model.datasource.DatasourceConnConfig; +import org.secretflow.dataproxy.common.model.datasource.DatasourceTypeEnum; +import org.secretflow.dataproxy.common.model.datasource.conn.ConnConfig; +import org.secretflow.dataproxy.common.model.datasource.conn.OdpsConnConfig; +import org.secretflow.dataproxy.common.model.datasource.location.LocationConfig; +import org.secretflow.dataproxy.common.model.datasource.location.OdpsTableInfo; +import org.secretflow.dataproxy.manager.Connector; +import org.secretflow.dataproxy.manager.DataReader; +import org.secretflow.dataproxy.manager.DataWriter; + +import java.io.IOException; +import java.util.Objects; + +/** + * odps Connector + * + * @author yuexie + * @date 2024-06-01 17:08:45 + */ +public class OdpsConnector implements Connector { + + /** + * odps connection config + */ + private final OdpsConnConfig config; + + public OdpsConnector(ConnConfig config) { + if (!(config instanceof OdpsConnConfig odpsConnConfig)) { + throw new IllegalArgumentException("Invalid conn config type."); + } + this.config = odpsConnConfig; + } + + @Override + public InferSchemaResult inferSchema(BufferAllocator allocator, LocationConfig locationConfig, DatasetFormatConfig formatConfig) { + + return InferSchemaResult.builder() + .datasetFormatConfig(formatConfig) + .schema(null) + .build(); + + } + + @Override + public DataReader buildReader(BufferAllocator allocator, DatasetReadCommand readCommand) { + + if (invalidateConnectionType(readCommand.getConnConfig())) { + throw new IllegalArgumentException("[ODPS] Unsupported datasource type."); + } + + if 
(Objects.equals(DatasetFormatTypeEnum.TABLE, readCommand.getFormatConfig().getType())) {
+            return new OdpsDataReader(allocator, config, (OdpsTableInfo) readCommand.getLocationConfig().getLocationConfig(), readCommand.getSchema());
+        }
+        return new OdpsResourceReader(allocator, config, (OdpsTableInfo) readCommand.getLocationConfig().getLocationConfig());
+    }
+
+    @Override
+    public DataWriter buildWriter(DatasetWriteCommand writeCommand) {
+
+        if (invalidateConnectionType(writeCommand.getConnConfig())) {
+            throw new IllegalArgumentException("[ODPS] Unsupported datasource type.");
+        }
+        OdpsTableInfo locationConfig = (OdpsTableInfo) writeCommand.getLocationConfig().getLocationConfig();
+
+        if (Objects.equals(DatasetFormatTypeEnum.TABLE, writeCommand.getFormatConfig().getType())) {
+            try {
+                return new OdpsDataWriter(config, locationConfig, writeCommand.getSchema());
+            } catch (TunnelException | IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+        return new OdpsResourceWriter(config, locationConfig);
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return true;
+    }
+
+    @Override
+    public void close() throws Exception {
+        // the ODPS client has nothing to close
+    }
+
+    /**
+     * Check whether the connection type is invalid
+     *
+     * @param connConfig connection config
+     * @return true when the config is missing or is not an ODPS datasource
+     */
+    private boolean invalidateConnectionType(DatasourceConnConfig connConfig) {
+        if (connConfig == null || connConfig.getType() == null) {
+            return true;
+        }
+        return connConfig.getType() != DatasourceTypeEnum.ODPS;
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsDataReader.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsDataReader.java
new file mode 100644
index 0000000..d848e52
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsDataReader.java
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2024 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.secretflow.dataproxy.manager.connector.odps;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.secretflow.dataproxy.common.model.datasource.conn.OdpsConnConfig;
+import org.secretflow.dataproxy.common.model.datasource.location.OdpsTableInfo;
+import org.secretflow.dataproxy.manager.DataReader;
+import org.secretflow.dataproxy.manager.SplitReader;
+
+import java.util.List;
+
+/**
+ * ODPS table reader
+ *
+ * @author yuexie
+ * @date 2024-06-01 17:08:45
+ */
+public class OdpsDataReader implements DataReader {
+
+    private final OdpsConnConfig odpsConnConfig;
+    private final BufferAllocator allocator;
+    private final OdpsTableInfo tableInfo;
+    private final Schema schema;
+
+    public OdpsDataReader(BufferAllocator allocator, OdpsConnConfig odpsConnConfig, OdpsTableInfo tableInfo, Schema schema) {
+        this.odpsConnConfig = odpsConnConfig;
+        this.allocator = allocator;
+        this.tableInfo = tableInfo;
+        this.schema = schema;
+    }
+
+    @Override
+    public List<SplitReader> createSplitReader(int splitNumber) {
+        // TODO: split the read into multiple readers according to splitNumber
+        return List.of(new OdpsSplitArrowReader(allocator, odpsConnConfig, tableInfo, schema));
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsDataWriter.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsDataWriter.java
new file mode 100644
index 0000000..32bdad3
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsDataWriter.java
@@ -0,0 +1,318 @@
+/*
+ * Copyright 2024 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.secretflow.dataproxy.manager.connector.odps; + + +import com.aliyun.odps.Column; +import com.aliyun.odps.Odps; +import com.aliyun.odps.OdpsType; +import com.aliyun.odps.PartitionSpec; +import com.aliyun.odps.TableSchema; +import com.aliyun.odps.data.Record; +import com.aliyun.odps.data.RecordWriter; +import com.aliyun.odps.tunnel.TableTunnel; +import com.aliyun.odps.tunnel.TunnelException; +import com.aliyun.odps.type.TypeInfo; +import com.aliyun.odps.type.TypeInfoFactory; +import lombok.extern.slf4j.Slf4j; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode; +import org.secretflow.dataproxy.common.exceptions.DataproxyException; +import org.secretflow.dataproxy.common.model.datasource.conn.OdpsConnConfig; +import org.secretflow.dataproxy.common.model.datasource.location.OdpsTableInfo; +import org.secretflow.dataproxy.manager.DataWriter; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.List; + +/** + * odps Table Writer + * + * @author yuexie + * @date 2024-06-01 17:08:45 + */ +@Slf4j +public class OdpsDataWriter implements DataWriter { + + + private final OdpsConnConfig connConfig; + + private final OdpsTableInfo tableInfo; + + private final Schema schema; + + private final boolean overwrite = true; + + private TableTunnel.UploadSession uploadSession = null; + private RecordWriter recordWriter = null; + + public OdpsDataWriter(OdpsConnConfig connConfig, OdpsTableInfo tableInfo, Schema schema) throws TunnelException, IOException { + this.connConfig = connConfig; + this.tableInfo = tableInfo; + this.schema = schema; + initOdps(); + } + + @Override + public void write(VectorSchemaRoot root) throws IOException { + + final int batchSize = root.getRowCount(); + log.info("odps writer batchSize: {}", batchSize); + int columnCount = root.getFieldVectors().size(); + + TableSchema tableSchema = uploadSession.getSchema(); + + Record record; + String columnName; + + for (int rowIndex = 0; rowIndex < batchSize; rowIndex++) { + record = uploadSession.newRecord(); + + for (int columnIndex = 0; columnIndex < columnCount; columnIndex++) { + log.debug("column: {}, type: {}", columnIndex, root.getFieldVectors().get(columnIndex).getField().getType()); + columnName = root.getVector(columnIndex).getField().getName(); + + if (tableSchema.containsColumn(columnName)) { + this.setRecordValue(record, tableSchema.getColumnIndex(columnName), this.getValue(root.getFieldVectors().get(columnIndex), rowIndex)); + } else { + log.warn("column: `{}` not exists in table: {}", columnName, tableInfo.tableName()); + } + + } + recordWriter.write(record); + log.debug("record: {}", record); + } + + } + + @Override + public void flush() throws IOException { + try { + if (recordWriter != null) { + recordWriter.close(); + } + if (uploadSession != null) { + uploadSession.commit(); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public void destroy() throws IOException { + + } + + @Override + public void 
close() throws Exception {
+        // the ODPS client does not require an explicit close
+    }
+
+    private Odps initOdpsClient(OdpsConnConfig odpsConnConfig) {
+
+        if (odpsConnConfig == null) {
+            throw new IllegalArgumentException("connConfig is null");
+        }
+
+        return OdpsUtil.buildOdps(odpsConnConfig);
+    }
+
+    private void initOdps() throws TunnelException, IOException {
+        // init odps client
+        Odps odps = initOdpsClient(this.connConfig);
+        // Pre-processing
+        preProcessing(odps, connConfig.getProjectName(), tableInfo.tableName());
+        // init upload session
+        TableTunnel tunnel = new TableTunnel(odps);
+        if (tableInfo.partitionSpec() != null && !tableInfo.partitionSpec().isEmpty()) {
+            PartitionSpec partitionSpec = new PartitionSpec(tableInfo.partitionSpec());
+            uploadSession = tunnel.createUploadSession(connConfig.getProjectName(), tableInfo.tableName(), partitionSpec, overwrite);
+        } else {
+            uploadSession = tunnel.createUploadSession(connConfig.getProjectName(), tableInfo.tableName(), overwrite);
+        }
+
+        recordWriter = uploadSession.openRecordWriter(0);
+    }
+
+    /**
+     * Handle type mismatches; anything unhandled should raise an error.
+     * TODO: design explicit type bindings between the two sides and fail on unbound types;
+     * for now only simple conversions are performed.
+     * Example: Record FLOAT/DOUBLE -> Arrow floatingpoint
+     *
+     * @param record      ODPS record
+     * @param columnIndex column index
+     * @param value       value
+     */
+    private void setRecordValue(Record record, int columnIndex, Object value) {
+        if (value == null) {
+            record.set(columnIndex, null);
+            return;
+        }
+
+        Column column = record.getColumns()[columnIndex];
+
+        OdpsType odpsType = column.getTypeInfo().getOdpsType();
+        log.debug("record odps type: {}", odpsType);
+        switch (odpsType) {
+            case STRING -> record.setString(columnIndex, String.valueOf(value));
+            case FLOAT -> record.set(columnIndex, Float.parseFloat(String.valueOf(value)));
+            case DOUBLE -> record.set(columnIndex, Double.parseDouble(String.valueOf(value)));
+            case BIGINT -> record.set(columnIndex, Long.parseLong(String.valueOf(value)));
+            case INT -> record.set(columnIndex, Integer.parseInt(String.valueOf(value)));
+            default -> record.set(columnIndex, value);
+        }
+    }
+
+    /**
+     * Read a field value from the vector
+     *
+     * @param fieldVector field vector
+     * @param index       index
+     * @return value
+     */
+    private Object getValue(FieldVector fieldVector, int index) {
+        if (fieldVector == null || index < 0) {
+            return null;
+        }
+        ArrowType.ArrowTypeID arrowTypeID = fieldVector.getField().getType().getTypeID();
+
+        switch (arrowTypeID) {
+            case Int -> {
+                if (fieldVector instanceof IntVector || fieldVector instanceof BigIntVector || fieldVector instanceof SmallIntVector) {
+                    return fieldVector.getObject(index);
+                }
+            }
+            case FloatingPoint -> {
+                if (fieldVector instanceof Float4Vector || fieldVector instanceof Float8Vector) {
+                    return fieldVector.getObject(index);
+                }
+            }
+            case Utf8 -> {
+                if (fieldVector instanceof VarCharVector vector) {
+                    return new String(vector.get(index), StandardCharsets.UTF_8);
+                }
+            }
+            case Null -> {
+                return null;
+            }
+            default -> {
+                log.warn("Not implemented type: {}, will use default function", arrowTypeID);
+                return fieldVector.getObject(index);
+            }
+
+        }
+        return null;
+    }
+
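+    /*
+     * Conversion summary for setRecordValue/getValue above (values travel
+     * through String.valueOf and are parsed back on the ODPS side):
+     *   Arrow Int           -> ODPS INT/BIGINT   via parseInt/parseLong
+     *   Arrow FloatingPoint -> ODPS FLOAT/DOUBLE via parseFloat/parseDouble
+     *   Arrow Utf8          -> ODPS STRING
+     *   other Arrow types   -> record.set(index, value) unchanged
+     */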
+    /**
+     * Pre-processing
+     * 1. Verify that the table exists; create it when it does not
+     *
+     * @param odps        odps client
+     * @param projectName project name
+     * @param tableName   table name
+     */
+    private void preProcessing(Odps odps, String projectName, String tableName) {
+
+        if (!isExistsTable(odps, projectName, tableName)) {
+            boolean odpsTable = createOdpsTable(odps, projectName, tableName, schema);
+            if (!odpsTable) {
+                throw DataproxyException.of(DataproxyErrorCode.ODPS_CREATE_TABLE_FAILED);
+            }
+        }
+        log.info("odps table exists or was created successfully, project: {}, table name: {}", projectName, tableName);
+    }
+
+    /**
+     * Check whether the table exists
+     *
+     * @param odps        odps client
+     * @param projectName project name
+     * @param tableName   table name
+     * @return true or false
+     */
+    private boolean isExistsTable(Odps odps, String projectName, String tableName) {
+        try {
+            return odps.tables().exists(projectName, tableName);
+        } catch (Exception e) {
+            log.error("check exists table error, projectName:{}, tableName:{}", projectName, tableName, e);
+        }
+        return false;
+    }
+
+    private boolean createOdpsTable(Odps odps, String projectName, String tableName, Schema schema) {
+        try {
+            odps.tables().create(projectName, tableName, convertToTableSchema(schema), true);
+            return true;
+        } catch (Exception e) {
+            log.error("create odps table error, projectName:{}, tableName:{}", projectName, tableName, e);
+        }
+        return false;
+    }
+
+    private TableSchema convertToTableSchema(Schema schema) {
+        List<Column> columns = schema.getFields().stream().map(this::convertToColumn).toList();
+        return TableSchema.builder().withColumns(columns).build();
+    }
+
+    private Column convertToColumn(Field field) {
+        return Column.newBuilder(field.getName(), convertToType(field.getType())).build();
+    }
+
+    private TypeInfo convertToType(ArrowType type) {
+
+        ArrowType.ArrowTypeID arrowTypeID = type.getTypeID();
+
+        switch (arrowTypeID) {
+            case Utf8 -> {
+                return TypeInfoFactory.STRING;
+            }
+            case FloatingPoint -> {
+
+                return switch (((ArrowType.FloatingPoint) type).getPrecision()) {
+                    case SINGLE -> TypeInfoFactory.FLOAT;
+                    case DOUBLE -> TypeInfoFactory.DOUBLE;
+                    default -> TypeInfoFactory.UNKNOWN;
+                };
+            }
+            case Int -> {
+                return TypeInfoFactory.INT;
+            }
+            case Time -> {
+                return TypeInfoFactory.TIMESTAMP;
+            }
+            case Date -> {
+                return TypeInfoFactory.DATE;
+            }
+            default -> {
+                log.warn("Not implemented type: {}", arrowTypeID);
+                return TypeInfoFactory.UNKNOWN;
+            }
+        }
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsResourceReader.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsResourceReader.java
new file mode 100644
index 0000000..c2f20a3
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsResourceReader.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2024 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.secretflow.dataproxy.manager.connector.odps;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.secretflow.dataproxy.common.model.datasource.conn.OdpsConnConfig;
+import org.secretflow.dataproxy.common.model.datasource.location.OdpsTableInfo;
+import org.secretflow.dataproxy.manager.DataReader;
+import org.secretflow.dataproxy.manager.SplitReader;
+
+import java.util.List;
+
+/**
+ * ODPS resource reader
+ *
+ * @author yuexie
+ * @date 2024-06-01 17:08:45
+ */
+public class OdpsResourceReader implements DataReader {
+
+    private final OdpsConnConfig odpsConnConfig;
+    private final BufferAllocator allocator;
+    private final OdpsTableInfo tableInfo;
+
+    public OdpsResourceReader(BufferAllocator allocator, OdpsConnConfig odpsConnConfig, OdpsTableInfo tableInfo) {
+        this.odpsConnConfig = odpsConnConfig;
+        this.allocator = allocator;
+        this.tableInfo = tableInfo;
+    }
+
+    @Override
+    public List<SplitReader> createSplitReader(int splitNumber) {
+        return List.of(new OdpsResourceSplitReader(allocator, odpsConnConfig, tableInfo));
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsResourceSplitReader.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsResourceSplitReader.java
new file mode 100644
index 0000000..56439c6
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsResourceSplitReader.java
@@ -0,0 +1,159 @@
+/*
+ * Copyright 2024 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.secretflow.dataproxy.manager.connector.odps;
+
+import com.aliyun.odps.Odps;
+import com.aliyun.odps.OdpsException;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.BitVectorHelper;
+import org.apache.arrow.vector.VarBinaryVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.ipc.ArrowReader;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode;
+import org.secretflow.dataproxy.common.exceptions.DataproxyException;
+import org.secretflow.dataproxy.common.model.datasource.conn.OdpsConnConfig;
+import org.secretflow.dataproxy.common.model.datasource.location.OdpsTableInfo;
+import org.secretflow.dataproxy.manager.SplitReader;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * ODPS resource split reader
+ *
+ * @author yuexie
+ * @date 2024-06-01 17:08:45
+ */
+@Slf4j
+public class OdpsResourceSplitReader extends ArrowReader implements SplitReader {
+
+    private static final String FIELD_NAME = "binary_data";
+    private static final int BATCH_SIZE = 3 * 1024 * 1024;
+
+    private final OdpsConnConfig odpsConnConfig;
+
+    private final OdpsTableInfo tableInfo;
+
+    private InputStream inputStream;
+
+
+    private int readIndex = 0;
+
+    protected OdpsResourceSplitReader(BufferAllocator allocator, OdpsConnConfig odpsConnConfig, OdpsTableInfo tableInfo) {
+        super(allocator);
+        this.odpsConnConfig = odpsConnConfig;
+        this.tableInfo = tableInfo;
+    }
+
+    @Override
+    public ArrowReader startRead() {
+
+        Odps odps = OdpsUtil.buildOdps(odpsConnConfig);
+        try {
+            inputStream = odps.resources().getResourceAsStream(tableInfo.tableName());
+        } catch (OdpsException e) {
+            throw new RuntimeException(e);
+        }
+
+        return this;
+    }
+
+    @Override
+    public boolean loadNextBatch() throws IOException {
+        VectorSchemaRoot root = getVectorSchemaRoot();
+        root.clear();
+
+        VarBinaryVector vector = (VarBinaryVector) root.getVector(FIELD_NAME);
+        vector.allocateNew(1);
+
+        // Grow the data buffer until it can hold a full batch
+        while (vector.getDataBuffer().capacity() < BATCH_SIZE) {
+            vector.reallocDataBuffer();
+        }
+
+        ArrowBuf dataBuffer = vector.getDataBuffer();
+
+        int l = readRangeToBuffer(dataBuffer, 0);
+        if (l == 0) {
+            return false;
+        }
+
+        readIndex += l;
+
+        vector.getOffsetBuffer().setInt(VarBinaryVector.OFFSET_WIDTH, l);
+        BitVectorHelper.setBit(vector.getValidityBuffer(), 0);
+        vector.setLastSet(0);
+
+        root.setRowCount(1);
+
+        return true;
+    }
+
+    @Override
+    public long bytesRead() {
+
+        try {
+            if (inputStream != null) {
+                return inputStream.available();
+            }
+            throw DataproxyException.of(DataproxyErrorCode.FILE_READ_STREAM_CREATE_FAILED);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    @Override
+    protected void closeReadSource() throws IOException {
+        try {
+            if (Objects.nonNull(inputStream)) {
+                inputStream.close();
+            }
+        } catch (IOException ignored) {
+
+        }
+    }
+
+    @Override
+    protected Schema readSchema() throws IOException {
+        return new Schema(List.of(Field.notNullable(FIELD_NAME, new ArrowType.Binary())));
+    }
+
+    private int readRangeToBuffer(ArrowBuf valueBuffer, int startIndex) {
+        if (inputStream == null) {
+            return 0;
+        }
+
+        try {
+            if (inputStream.available() == 0) {
+                return 0;
+            }
+
+            byte[] bytes = new byte[1024];
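+            // Note: each call copies at most one 1 KiB chunk into the Arrow buffer,
+            // so loadNextBatch() emits one small row per call even though the data
+            // buffer was grown to BATCH_SIZE; read() may also return fewer bytes.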
+ int length = inputStream.read(bytes); + valueBuffer.writeBytes(bytes, startIndex, length); + return length; + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsResourceWriter.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsResourceWriter.java new file mode 100644 index 0000000..772e172 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsResourceWriter.java @@ -0,0 +1,123 @@ +/* + * Copyright 2024 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.secretflow.dataproxy.manager.connector.odps; + + +import com.aliyun.odps.FileResource; +import com.aliyun.odps.NoSuchObjectException; +import com.aliyun.odps.Odps; +import com.aliyun.odps.OdpsException; +import com.aliyun.odps.Resource; +import lombok.extern.slf4j.Slf4j; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.VarBinaryVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode; +import org.secretflow.dataproxy.common.exceptions.DataproxyException; +import org.secretflow.dataproxy.common.model.datasource.conn.OdpsConnConfig; +import org.secretflow.dataproxy.common.model.datasource.location.OdpsTableInfo; +import org.secretflow.dataproxy.manager.DataWriter; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; + +/** + * odps Resource Writer + * + * @author yuexie + * @date 2024-06-01 17:08:45 + */ +@Slf4j +public class OdpsResourceWriter implements DataWriter { + + private final OdpsConnConfig odpsConnConfig; + private final OdpsTableInfo odpsTableInfo; + + private Odps odps; + + private static final String FIELD_NAME = "binary_data"; + + private InputStream odpsInputStream = null; + + public OdpsResourceWriter(OdpsConnConfig odpsConnConfig, OdpsTableInfo odpsTableInfo) { + this.odpsConnConfig = odpsConnConfig; + this.odpsTableInfo = odpsTableInfo; + initOdps(); + } + + + @Override + public void write(VectorSchemaRoot root) throws IOException { + + FieldVector vector = root.getVector(FIELD_NAME); + + if (vector instanceof VarBinaryVector varBinaryVector) { + + int rowCount = root.getRowCount(); + for (int row = 0; row < rowCount; row++) { + byte[] bytes = varBinaryVector.get(row); + + odpsInputStream = new ByteArrayInputStream(bytes); + FileResource fileResource = new FileResource(); + fileResource.setName(odpsTableInfo.tableName()); + try { + if (resourceExists(odps, odpsTableInfo.tableName())) { + odps.resources().update(fileResource, odpsInputStream); + } else { + odps.resources().create(fileResource, odpsInputStream); + } + } catch (OdpsException e) { + throw new RuntimeException(e); + } + } + } else { + throw DataproxyException.of(DataproxyErrorCode.UNSUPPORTED_FIELD_TYPE, "Only support VarBinaryVector type"); + } + + } + + @Override + public 
void flush() throws IOException { + + } + + @Override + public void destroy() throws IOException { + + } + + @Override + public void close() throws Exception { + if (odpsInputStream != null) { + odpsInputStream.close(); + } + } + + private void initOdps() { + odps = OdpsUtil.buildOdps(odpsConnConfig); + } + + private static boolean resourceExists(Odps odps, String resourceName) throws OdpsException { + try { + Resource resource = odps.resources().get(resourceName); + resource.reload(); + return true; + } catch (NoSuchObjectException e) { + return false; + } + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsSplitArrowReader.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsSplitArrowReader.java new file mode 100644 index 0000000..29655b5 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsSplitArrowReader.java @@ -0,0 +1,349 @@ +/* + * Copyright 2024 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.secretflow.dataproxy.manager.connector.odps; + +import com.aliyun.odps.Instance; +import com.aliyun.odps.Odps; +import com.aliyun.odps.OdpsException; +import com.aliyun.odps.TableSchema; +import com.aliyun.odps.data.Record; +import com.aliyun.odps.data.ResultSet; +import com.aliyun.odps.task.SQLTask; +import com.aliyun.odps.utils.StringUtils; +import lombok.extern.slf4j.Slf4j; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.ValueVectorUtility; +import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode; +import org.secretflow.dataproxy.common.exceptions.DataproxyException; +import org.secretflow.dataproxy.common.model.datasource.conn.OdpsConnConfig; +import org.secretflow.dataproxy.common.model.datasource.location.OdpsTableInfo; +import org.secretflow.dataproxy.manager.SplitReader; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * odps Table Split Reader + * + * @author yuexie + * @date 2024-06-01 17:08:45 + */ +@Slf4j +public class OdpsSplitArrowReader extends ArrowReader implements SplitReader, AutoCloseable { + + private final OdpsConnConfig odpsConnConfig; + + private final OdpsTableInfo tableInfo; + + private final Schema schema; + + 
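+    /**
+     * Schema of the SQLTask result set, captured in startRead().
+     */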
private TableSchema tableSchema;
+
+    private final int batchSize = 1000;
+
+    private ResultSet resultSet;
+
+    private final Pattern columnOrValuePattern = Pattern.compile("^[\\u00b7A-Za-z0-9\\u4e00-\\u9fa5\\-_,.]*$");
+
+    protected OdpsSplitArrowReader(BufferAllocator allocator, OdpsConnConfig odpsConnConfig, OdpsTableInfo tableInfo, Schema schema) {
+        super(allocator);
+        this.odpsConnConfig = odpsConnConfig;
+        this.tableInfo = tableInfo;
+        this.schema = schema;
+    }
+
+    @Override
+    public boolean loadNextBatch() throws IOException {
+        VectorSchemaRoot root = getVectorSchemaRoot();
+        root.clear();
+
+        ValueVectorUtility.preAllocate(root, batchSize);
+        Record next;
+
+        int recordCount = 0;
+        if (!resultSet.hasNext()) {
+            return false;
+        }
+        while (resultSet.hasNext()) {
+            next = resultSet.next();
+            if (next != null) {
+
+                ValueVectorUtility.ensureCapacity(root, recordCount + 1);
+                toArrowVector(next, root, recordCount);
+                recordCount++;
+            }
+
+            if (recordCount == batchSize) {
+                root.setRowCount(recordCount);
+                return true;
+            }
+        }
+        root.setRowCount(recordCount);
+        return true;
+    }
+
+    @Override
+    public long bytesRead() {
+        return 0;
+    }
+
+    @Override
+    protected void closeReadSource() throws IOException {
+
+    }
+
+    @Override
+    protected Schema readSchema() throws IOException {
+        return this.schema;
+    }
+
+    @Override
+    public ArrowReader startRead() {
+
+        Odps odps = OdpsUtil.buildOdps(odpsConnConfig);
+        String sql = "";
+        try {
+            sql = this.buildSql(tableInfo.tableName(), tableInfo.fields(), tableInfo.partitionSpec());
+            log.debug("SQLTask run sql: {}", sql);
+
+            Instance instance = SQLTask.run(odps, sql);
+            // Wait for the task to finish
+            instance.waitForSuccess();
+
+            resultSet = SQLTask.getResultSet(instance);
+
+            tableSchema = resultSet.getTableSchema();
+
+        } catch (OdpsException e) {
+            log.error("SQLTask run error, sql: {}", sql, e);
+            throw DataproxyException.of(DataproxyErrorCode.ODPS_ERROR, e.getMessage(), e);
+        } catch (IOException e) {
+            log.error("startRead error, sql: {}", sql, e);
+            throw new RuntimeException(e);
+        }
+
+        return this;
+    }
+
+    private String buildSql(String tableName, List<String> fields, String partition) {
+
+        if (!columnOrValuePattern.matcher(tableName).matches()) {
+            throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "Invalid tableName:" + tableName);
+        }
+        // Field names are interpolated into the SQL text, so validate them like the table name
+        for (String field : fields) {
+            if (!columnOrValuePattern.matcher(field).matches()) {
+                throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "Invalid field name:" + field);
+            }
+        }
+
+        String transformedPartition = buildWhereClause(partition);
+        return "select " + String.join(",", fields) + " from " + tableName + (transformedPartition.isEmpty() ? "" : " where " + transformedPartition) + ";";
+    }
+
+    /**
+     * Deprecated method; to be removed later
+     *
+     * @param partition partition spec
+     * @return where clause string
+     */
+    @Deprecated
+    private String transformPartition(String partition) {
+
+        Map<String, List<String>> fieldValuesMap = new HashMap<>();
+
+        if (partition != null) {
+            String[] split = StringUtils.split(partition, ';');
+            for (String s : split) {
+                String[] kv = StringUtils.split(s, '=');
+                if (kv.length != 2 || kv[0].isEmpty() || kv[1].isEmpty()) {
+                    throw DataproxyException.of(DataproxyErrorCode.INVALID_PARTITION_SPEC);
+                }
+                if (fieldValuesMap.containsKey(kv[0])) {
+                    fieldValuesMap.get(kv[0]).add(kv[1]);
+                } else {
+                    fieldValuesMap.put(kv[0], new ArrayList<>(List.of(kv[1])));
+                }
+            }
+        }
+
+        return buildEqualClause(fieldValuesMap).toString();
+    }
+
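+    /*
+     * Worked example for the clause builders below: the condition string
+     * "p1=a;p1=b;dt>20240101" yields
+     *   dt > '20240101' and p1 in ('a', 'b')
+     */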
+    /**
+     * Build equality conditions: a field with multiple values is turned into an "in"
+     * condition, while a single-valued field keeps an "=" condition
+     *
+     * @param fieldValuesMap field-to-values map
+     * @return where clause string
+     */
+    private StringBuilder buildEqualClause(Map<String, List<String>> fieldValuesMap) {
+        StringBuilder sb = new StringBuilder();
+        if (!fieldValuesMap.isEmpty()) {
+
+            boolean first = true;
+            for (Map.Entry<String, List<String>> entry : fieldValuesMap.entrySet()) {
+                if (!first) {
+                    sb.append(" and ");
+                }
+                first = false;
+                sb.append(entry.getKey());
+                List<String> values = entry.getValue();
+                if (values.size() > 1) {
+                    sb.append(" in (");
+                    for (String value : values) {
+                        sb.append("'").append(value).append("'").append(", ");
+                    }
+                    sb.setLength(sb.length() - 2);
+                    sb.append(")");
+                } else {
+                    sb.append(" = ").append("'").append(values.get(0)).append("'");
+                }
+            }
+        }
+
+        return sb;
+    }
+
+    /**
+     * TODO: for JDBC-style access this logic could be extracted and shared
+     *
+     * @param conditionString condition string
+     * @return where clause
+     */
+    private String buildWhereClause(String conditionString) {
+
+        if (conditionString == null || conditionString.isEmpty()) {
+            return "";
+        }
+
+        String[] conditions = conditionString.split(";");
+
+        StringBuilder whereClause = new StringBuilder();
+        Pattern pattern = Pattern.compile("^(\\w+)(>=|<=|<>|!=|=|>|<| LIKE | like )(.*)$");
+
+
+        Map<String, List<String>> equalFieldValuesMap = new HashMap<>();
+
+        for (String condition : conditions) {
+            Matcher matcher = pattern.matcher(condition.trim());
+
+            if (!matcher.matches() || matcher.groupCount() != 3) {
+                throw new DataproxyException(DataproxyErrorCode.INVALID_PARTITION_SPEC, "Invalid condition format: " + condition);
+            }
+
+            String column = matcher.group(1).trim();
+            String operator = matcher.group(2);
+            String value = matcher.group(3).trim();
+
+            if (!columnOrValuePattern.matcher(column).matches()) {
+                throw new DataproxyException(DataproxyErrorCode.INVALID_PARTITION_SPEC, "Invalid condition format: " + column);
+            }
+
+            if (!columnOrValuePattern.matcher(value).matches()) {
+                throw new DataproxyException(DataproxyErrorCode.INVALID_PARTITION_SPEC, "Invalid condition format: " + column);
+            }
+
+            // Sanitize the user-supplied value; extend as needed for the concrete use case
+            value = value.replace("'", "''"); // naive single-quote escaping
+
+            if ("=".equals(operator)) {
+                if (equalFieldValuesMap.containsKey(column)) {
+                    equalFieldValuesMap.get(column).add(value);
+                } else {
+                    equalFieldValuesMap.put(column, new ArrayList<>(List.of(value)));
+                }
+            } else {
+                if (!whereClause.isEmpty()) {
+                    whereClause.append(" and ");
+                }
+                whereClause.append(column).append(' ').append(operator).append(" '").append(value).append("'");
+            }
+        }
+        StringBuilder equalFieldClause = buildEqualClause(equalFieldValuesMap);
+
+        if (whereClause.isEmpty()) {
+            return equalFieldClause.toString();
+        }
+
+        if (!equalFieldClause.isEmpty()) {
+            whereClause.append(" and ").append(equalFieldClause);
+        }
+        return whereClause.toString();
+    }
+
+    private void toArrowVector(Record record, VectorSchemaRoot root, int rowIndex) throws IOException {
+        FieldVector vector;
+        String columnName;
+        for (Field field : schema.getFields()) {
+            vector = root.getVector(field);
+            if (vector != null) {
+                columnName = field.getName();
+                if (tableSchema.containsColumn(columnName)) {
+                    setValue(vector.getField().getType(), vector, rowIndex, record, columnName);
+                    vector.setValueCount(rowIndex + 1);
+                }
+            }
+        }
+    }
+
+    private void setValue(ArrowType type, FieldVector vector, int rowIndex, Record record, String columnName) {
+        log.debug("columnName: {} type ID: {}, value: {}", columnName, type.getTypeID(), record.get(columnName));
+        if (record.get(columnName) == null) {
+            return;
+        }
+        switch (type.getTypeID()) {
+            case Int -> {
+                if (vector instanceof IntVector intVector) {
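+                    // The record value is re-parsed from its string form, so an
+                    // oversized BIGINT routed into a 32-bit IntVector fails fast
+                    // with a NumberFormatException instead of silently truncating.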
intVector.setSafe(rowIndex, Integer.parseInt(record.get(columnName).toString())); + } else if (vector instanceof BigIntVector bigIntVector) { + bigIntVector.setSafe(rowIndex, Long.parseLong(record.get(columnName).toString())); + } else { + log.warn("Unsupported type: {}", type); + } + } + case Utf8 -> { + if (vector instanceof VarCharVector varcharVector) { + varcharVector.setSafe(rowIndex, record.getString(columnName).getBytes(StandardCharsets.UTF_8)); + } else { + log.warn("Unsupported type: {}", type); + } + } + case FloatingPoint -> { + if (vector instanceof Float4Vector floatVector) { + floatVector.setSafe(rowIndex, Float.parseFloat(record.get(columnName).toString())); + } else if (vector instanceof Float8Vector doubleVector) { + doubleVector.setSafe(rowIndex, Double.parseDouble(record.get(columnName).toString())); + } else { + log.warn("Unsupported type: {}", type); + } + } + + default -> throw new IllegalArgumentException("Unsupported type: " + type); + } + + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsUtil.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsUtil.java new file mode 100644 index 0000000..c0d69fb --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/odps/OdpsUtil.java @@ -0,0 +1,39 @@ +/* + * Copyright 2024 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.secretflow.dataproxy.manager.connector.odps; + +import com.aliyun.odps.Odps; +import com.aliyun.odps.account.Account; +import com.aliyun.odps.account.AliyunAccount; +import org.secretflow.dataproxy.common.model.datasource.conn.OdpsConnConfig; + +/** + * odps util + * + * @author yuexie + * @date 2024-06-01 17:08:45 + */ +public class OdpsUtil { + + public static Odps buildOdps(OdpsConnConfig odpsConnConfig) { + Account account = new AliyunAccount(odpsConnConfig.getAccessKeyId(), odpsConnConfig.getAccessKeySecret()); + Odps odps = new Odps(account); + odps.setEndpoint(odpsConnConfig.getEndpoint()); + odps.setDefaultProject(odpsConnConfig.getProjectName()); + + return odps; + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcAssistant.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcAssistant.java new file mode 100644 index 0000000..2c9d7cf --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcAssistant.java @@ -0,0 +1,324 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms;
+
+import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode;
+import org.secretflow.dataproxy.common.exceptions.DataproxyException;
+import org.secretflow.dataproxy.common.model.dataset.format.IndexType;
+import org.secretflow.dataproxy.common.model.dataset.format.TableFormatConfig;
+import org.secretflow.dataproxy.common.model.dataset.format.TableIndex;
+import org.secretflow.dataproxy.common.model.datasource.conn.JdbcBaseConnConfig;
+import org.secretflow.dataproxy.common.model.datasource.location.JdbcLocationConfig;
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder.ColumnBinder;
+
+import com.zaxxer.hikari.HikariConfig;
+import com.zaxxer.hikari.HikariDataSource;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.arrow.vector.util.Text;
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.sql.Connection;
+import java.sql.JDBCType;
+import java.time.LocalDateTime;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeFormatterBuilder;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * jdbc assistant
+ *
+ * @author muhong
+ * @date 2023-09-07 19:55
+ */
+public interface JdbcAssistant<C extends JdbcBaseConnConfig, L extends JdbcLocationConfig> {
+
+    /**
+     * Get the connection validation query.
+     *
+     * @return validation query
+     */
+    String getConnectionTestQuery();
+
+    /**
+     * Get the JDBC driver class name.
+     *
+     * @return driver class name
+     */
+    String getDriverClass();
+
+    /**
+     * Initialize the url, catalog and schema of the datasource.
+     *
+     * @param config     datasource hikari config
+     * @param connConfig jdbc connection config
+     */
+    void initDataSourceConfig(HikariConfig config, C connConfig);
+
+    default void fillDefaultValue(C connConfig, L locationConfig) {
+    }
+
+    /**
+     * Decorate an identifier, e.g. mysql wraps the identifier in ` characters.
+     * Identifiers include: database, table, index, column, alias, view, stored procedure, partition,
+     * tablespace, resource group and other objects.
+     *
+     * @param identifier raw identifier
+     * @return decorated identifier
+     */
+    String decorateIdentifier(String identifier);
+
+    default String decorateStrValue(String value) {
+        return "'" + value + "'";
+    }
+
+    /**
+     * Compose the table name, e.g. DB2 rewrites tableName as schemaName.tableName.
+     *
+     * @param locationConfig location config
+     * @return composed table name
+     */
+    String composeTableName(L locationConfig);
+
+    /**
+     * Whether batched inserts via PreparedStatement are supported.
+     *
+     * @return true if supported
+     */
+    default boolean supportBatchInsert() {
+        return true;
+    }
+
+    /**
+     * Build the column selection part of a query.
+     *
+     * @param rawFieldName     raw field names
+     * @param composeTableName composed table name
+     * @return column selection sql part
+     */
+    default String createFieldPart(List<String> rawFieldName, String composeTableName) {
+        List<String> requestedColumnNameList = rawFieldName.stream()
+            .map(this::decorateIdentifier)
+            .collect(Collectors.toList());
+        StringBuilder selectSqlBuilder = new StringBuilder();
+        selectSqlBuilder.append(StringUtils.join(requestedColumnNameList, ", "));
+        selectSqlBuilder.append(" from ").append(composeTableName).append(" ");
+        return selectSqlBuilder.toString();
+    }
+
+    /**
+     * Select SQL template.
+     *
+     * @return select SQL template
+     */
+    default String selectSQLTemplate() {
+        return "select ${sqlPart} ${limitPart}";
+    }
+
+    default String generateLimitConditionTemplate(boolean otherFilter) {
+        return "limit %s";
+    }
+
+    /**
+     * Create-table SQL.
+     *
+     * @param schema       arrow schema
+     * @param formatConfig data format config
+     * @return SQL
+     */
+    default String createTableSql(String composeTableName, Schema schema, TableFormatConfig formatConfig) {
+        return "CREATE TABLE " + composeTableName + " ("
+            + createTableColumnTypes(schema.getFields(), formatConfig)
+            + "," + decorateIdentifier(formatConfig.getPrimaryKey())
+            + " BIGINT PRIMARY KEY NOT NULL AUTO_INCREMENT"
+            + createIndex(formatConfig.getIndexList(), schema.getFields())
+            + ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4";
+    }
+
+    /**
+     * SQLs to execute in the target database beforehand.
+     *
+     * @param composeTableName composed table name
+     * @param schema           data schema
+     * @param formatConfig     data format config
+     * @return SQL list
+     */
+    List<String> preWorkSqls(String composeTableName, Schema schema, L locationConfig, TableFormatConfig formatConfig);
+
+    /**
+     * Generate the drop-table sql statement.
+     *
+     * @param composeTableName composed table name
+     * @return sql
+     */
+    default String dropTableSql(String composeTableName) {
+        return "DROP TABLE IF EXISTS " + composeTableName;
+    }
+
+    /**
+     * Create the column definition sql.
+     *
+     * @param fields column definition list
+     * @return column definition sql
+     */
+    default String createTableColumnTypes(List<Field> fields, TableFormatConfig formatConfig) {
+        return fields.stream()
+            .filter(field -> !field.getName().equals(formatConfig.getPrimaryKey()))
+            .map(this::arrowFieldToSqlColumnDefinition)
+            .collect(Collectors.joining(","));
+    }
+
+    /**
+     * Single-column definition sql.
+     *
+     * @param field column definition
+     * @return sql
+     */
+    default String arrowFieldToSqlColumnDefinition(Field field) {
+        return decorateIdentifier(field.getName()) + " " + jdbcTypeToDbTypeString(arrowTypeToJdbcType(field));
+    }
+
+    /**
+     * Convert an arrow field to a JDBCType.
+     *
+     * @param field arrow field
+     * @return jdbc type
+     */
+    default JDBCType arrowTypeToJdbcType(Field field) {
+        // create a temporary vector only to derive its JDBC type
+        try (BufferAllocator tempAllocator = new RootAllocator();
+             FieldVector tempVector = field.createVector(tempAllocator)) {
+            ColumnBinder columnBinder = ColumnBinder.forVector(tempVector);
+            return JDBCType.valueOf(columnBinder.getJdbcType());
+        }
+    }
+
+    /**
+     * Convert a JDBC type to the database's type string.
+     *
+     * @param jdbcType jdbc type
+     * @return type string
+     */
+    String jdbcTypeToDbTypeString(JDBCType jdbcType);
+
+    /**
+     * Build the index clauses.
+     *
+     * @param indexList index definitions
+     * @return index sql part
+     */
+    default String createIndex(List<TableIndex> indexList, List<Field> fields) {
+        Map<String, Field> fieldMap = fields.stream().collect(Collectors.toMap(Field::getName, field -> field));
+
+        StringBuilder stringBuilder = new StringBuilder();
+        if (CollectionUtils.isNotEmpty(indexList)) {
+            for (int i = 0; i < indexList.size(); i++) {
+                TableIndex index = indexList.get(i);
+                if (CollectionUtils.isNotEmpty(index.getField())) {
+                    stringBuilder.append(",");
+                    stringBuilder.append(indexKeyword(index.getType()));
+                    stringBuilder.append(" ");
+                    stringBuilder.append(decorateIdentifier("idx_" + i));
+                    stringBuilder.append(" (");
+                    stringBuilder.append(index.getField().stream()
+                        .map(fieldName -> {
+                            String decorateIdentifier = decorateIdentifier(fieldName);
+                            Field field = fieldMap.get(fieldName);
+
+                            // string columns need a bounded index prefix length
+                            if (field.getFieldType().getType().getTypeID() == ArrowType.ArrowTypeID.Utf8
+                                || field.getFieldType().getType().getTypeID() == ArrowType.ArrowTypeID.LargeUtf8) {
+                                decorateIdentifier = decorateIdentifier + "(128)";
+                            }
+                            return decorateIdentifier;
+                        })
+                        .collect(Collectors.joining(",")));
+                    stringBuilder.append(") ");
+                }
+            }
+        }
+        return stringBuilder.toString();
+    }
+
+    /**
+     * Get the keyword for an index type.
+     *
+     * @param indexType index type
+     * @return keyword
+     */
+    String indexKeyword(IndexType indexType);
+
+    /**
+     * Get a database connection.
+     *
+     * @return connection
+     */
+    default Connection getDatabaseConn(HikariDataSource dataSource) {
+        try {
+            return dataSource.getConnection();
+        } catch (Exception e) {
+            throw DataproxyException.of(DataproxyErrorCode.JDBC_DATASOURCE_CONNECTION_VALIDATE_FAILED, e);
+        }
+    }
+
+    /**
+     * Serialize a value to its string form.
+     *
+     * @param value raw value
+     * @return serialized string
+     */
+    default String serialize(JDBCType type, Object value) {
+        // text cannot distinguish empty content from null, so null is passed through
+        if (value == null) {
+            return null;
+        }
+
+        if (value instanceof Double || value instanceof Float || value instanceof Short || value instanceof Byte
+            || value instanceof Integer || value instanceof Long || value instanceof Boolean
+            || value instanceof BigDecimal || value instanceof BigInteger) {
+            return value.toString();
+        }
+
+        // byte arrays are handled separately
+        if (value instanceof byte[]) {
+            return decorateStrValue(new String((byte[]) value));
+        }
+        if (value instanceof Text) {
+            return decorateStrValue(value.toString());
+        }
+        if (value instanceof LocalDateTime) {
+            return decorateStrValue(((LocalDateTime) value).format(new DateTimeFormatterBuilder()
+                .parseCaseInsensitive()
+                .append(DateTimeFormatter.ISO_LOCAL_DATE)
+                .appendLiteral(' ')
+                .append(DateTimeFormatter.ISO_LOCAL_TIME)
+                .toFormatter()));
+        }
+
+        // fallback: transfer everything as a string
+        return decorateStrValue(value.toString());
+    }
+}
\ No newline at end of file
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcConnector.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcConnector.java
new file mode 100644
index 0000000..49af08b
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcConnector.java
@@ -0,0 +1,393 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms;
+
+import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode;
+import org.secretflow.dataproxy.common.exceptions.DataproxyException;
+import org.secretflow.dataproxy.common.model.InferSchemaResult;
+import org.secretflow.dataproxy.common.model.command.DatasetReadCommand;
+import org.secretflow.dataproxy.common.model.command.DatasetWriteCommand;
+import org.secretflow.dataproxy.common.model.dataset.DatasetFormatConfig;
+import org.secretflow.dataproxy.common.model.dataset.format.*;
+import org.secretflow.dataproxy.common.model.datasource.DatasourceTypeEnum;
+import org.secretflow.dataproxy.common.model.datasource.conn.JdbcBaseConnConfig;
+import org.secretflow.dataproxy.common.model.datasource.location.JdbcLocationConfig;
+import org.secretflow.dataproxy.common.model.datasource.location.LocationConfig;
+import org.secretflow.dataproxy.common.utils.IdUtils;
+import org.secretflow.dataproxy.manager.Connector;
+import org.secretflow.dataproxy.manager.DataReader;
+import org.secretflow.dataproxy.manager.DataWriter;
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.JdbcToArrowConfig;
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.JdbcToArrowConfigBuilder;
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.JdbcToArrowUtils;
+
+import com.zaxxer.hikari.HikariConfig;
+import com.zaxxer.hikari.HikariDataSource;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.commons.collections4.MapUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.text.StringSubstitutor;
+
+import java.math.RoundingMode;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * @author muhong
+ * @date 2023-09-07 13:47
+ */
+public class JdbcConnector implements Connector {
+
+    protected JdbcBaseConnConfig connConfig;
+
+    protected HikariDataSource dataSource;
+
+    protected JdbcAssistant jdbcAssistant;
+
+    public JdbcConnector() {
+    }
+
+    public JdbcConnector(DatasourceTypeEnum type, JdbcBaseConnConfig connConfig) {
+        // pick the jdbc assistant implementation for this datasource type
+        switch (type) {
+            case MYSQL:
+                this.jdbcAssistant = new MysqlJdbcAssistant();
+                break;
+            default:
+                throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "Unsupported jdbc datasource type " + type);
+        }
+
+        HikariConfig config = new HikariConfig();
+        config.setUsername(connConfig.getUserName());
+        config.setPassword(connConfig.getPassword());
+        config.setDriverClassName(this.jdbcAssistant.getDriverClass());
+        config.setConnectionTestQuery(this.jdbcAssistant.getConnectionTestQuery());
+        config.setMaximumPoolSize(connConfig.getMaximumPoolSize());
+        config.setMinimumIdle(connConfig.getMinimumIdle());
+        config.addDataSourceProperty("cachePrepStmts", connConfig.getCachePrepStmts());
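+        // useServerPrepStmts/prepStmtCache* below tune statement caching; allowLoadLocalInfile,
+        // allowUrlInLocalInfile and autoDeserialize are pinned to safe values because they are
+        // well-known MySQL Connector/J attack vectors.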
config.addDataSourceProperty("useServerPrepStmts", connConfig.getUseServerPrepStmts()); + config.addDataSourceProperty("prepStmtCacheSize", connConfig.getPrepStmtCacheSize()); + config.addDataSourceProperty("prepStmtCacheSqlLimit", connConfig.getPrepStmtCacheSqlLimit()); + config.addDataSourceProperty("allowLoadLocalInfile", "false"); + config.addDataSourceProperty("allowUrlInLocalInfile", "false"); + config.addDataSourceProperty("allowLoadLocalInfileInPath", ""); + config.addDataSourceProperty("autoDeserialize", "false"); + + // 不同数据库对 catalog 和 schema 的使用方法不同,所以交给子类处理 + this.jdbcAssistant.initDataSourceConfig(config, connConfig); + this.connConfig = connConfig; + try { + dataSource = new HikariDataSource(config); + } catch (Exception e) { + throw DataproxyException.of(DataproxyErrorCode.CREATE_DATASOURCE_CONNECTOR_ERROR, e.getMessage(), e); + } + checkAdaptorStatus(); + } + + /** + * 获取真实字段名 + * + * @param viewFieldName 展示列名 + * @param fieldMap 真实列名-展示列名映射 + * @return 真实列名 + */ + public static String getRawFieldName(String viewFieldName, Map fieldMap) { + // 若列名为空或映射关系为空,直接返回 + if (StringUtils.isEmpty(viewFieldName) || MapUtils.isEmpty(fieldMap)) { + return viewFieldName; + } + + for (Map.Entry entry : fieldMap.entrySet()) { + if (entry.getValue().equals(viewFieldName)) { + return entry.getKey(); + } + } + + // 映射关系中没有的,直接展示列名 + return viewFieldName; + } + + @Override + public InferSchemaResult inferSchema(BufferAllocator allocator, LocationConfig locationConfig, DatasetFormatConfig formatConfig) { + this.jdbcAssistant.fillDefaultValue(connConfig, (JdbcLocationConfig) locationConfig); + String table = ((JdbcLocationConfig) locationConfig).getTable(); + + TableFormatConfig reqFormatConfig = (TableFormatConfig) formatConfig.getFormatConfig(); + TableFormatConfig resultFormatConfig = reqFormatConfig == null ? 
+            TableFormatConfig.builder().build() : reqFormatConfig.toBuilder().build();
+
+        try (Connection conn = this.jdbcAssistant.getDatabaseConn(this.dataSource)) {
+
+            // schema inference
+            Schema schema = getSchema(allocator, conn, this.jdbcAssistant.composeTableName((JdbcLocationConfig) locationConfig));
+
+            // raw field name set
+            Set<String> fieldNameSet = schema.getFields().stream().map(Field::getName).collect(Collectors.toSet());
+            Set<String> viewFieldNameSet = new HashSet<>();
+
+            // field mapping check
+            if (MapUtils.isNotEmpty(resultFormatConfig.getFieldMap())) {
+                resultFormatConfig.getFieldMap().forEach((rawFieldName, viewFieldName) -> {
+                    if (!fieldNameSet.contains(rawFieldName)) {
+                        throw DataproxyException.of(DataproxyErrorCode.FIELD_NOT_EXIST, "mapped field " + rawFieldName);
+                    }
+                    if (viewFieldNameSet.contains(viewFieldName)) {
+                        throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "field " + viewFieldName + " duplicated");
+                    }
+                    viewFieldNameSet.add(viewFieldName);
+                });
+
+                fieldNameSet.stream()
+                    .filter(fieldName -> !resultFormatConfig.getFieldMap().containsKey(fieldName))
+                    .forEach(fieldName -> {
+                        if (viewFieldNameSet.contains(fieldName)) {
+                            throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "field " + fieldName + " duplicated");
+                        }
+                        viewFieldNameSet.add(fieldName);
+                        resultFormatConfig.getFieldMap().put(fieldName, fieldName);
+                    });
+            } else {
+                resultFormatConfig.setFieldMap(fieldNameSet.stream().collect(Collectors.toMap(name -> name, name -> name)));
+                viewFieldNameSet.addAll(fieldNameSet);
+            }
+
+            // primary key inference and validation
+            if (StringUtils.isNotEmpty(resultFormatConfig.getPrimaryKey())) {
+                // the request already carries a primary key, verify that the field exists
+                if (!fieldNameSet.contains(getRawFieldName(resultFormatConfig.getPrimaryKey(), resultFormatConfig.getFieldMap()))) {
+                    throw DataproxyException.of(DataproxyErrorCode.FIELD_NOT_EXIST, "primary key " + resultFormatConfig.getPrimaryKey());
+                }
+            } else {
+                String primaryKey = getPrimaryKeyColumnName(conn, table);
+                resultFormatConfig.setPrimaryKey(primaryKey);
+            }
+
+            // index inference and validation
+//            if (CollectionUtils.isNotEmpty(resultFormatConfig.getIndex())) {
+//                resultFormatConfig.getIndex().forEach(index -> {
+//                    if (CollectionUtils.isEmpty(index.getField())) {
+//                        throw FastDFException.of(DataProxyErrorCode.PARAMS_UNRELIABLE, "index " + index.getIndexName() + " has no fields");
+//                    }
+//                    if (!viewFieldNameSet.containsAll(index.getField())) {
+//                        throw FastDFException.of(DataProxyErrorCode.PARAMS_UNRELIABLE, "index " + index.getIndexName() + " field does not exist");
+//                    }
+//                });
+//            } else {
+//                // query the indexes
+//                List<TableIndex> indexList = getIndex(conn, table);
+//                resultFormatConfig.setIndex(indexList);
+//            }
+
+            // a partition field was supplied up front, so validate it strictly
+            if (resultFormatConfig.getPartitionBehavior() != null && StringUtils.isNotEmpty(resultFormatConfig.getPartitionBehavior().getFieldName())) {
+                String partitionFieldName = resultFormatConfig.getPartitionBehavior().getFieldName();
+                if (!viewFieldNameSet.contains(partitionFieldName)) {
+                    throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "partition field " + partitionFieldName + " does not exist");
+                }
+                ArrowType.ArrowTypeID partitionFieldTypeId = schema.findField(partitionFieldName).getFieldType().getType().getTypeID();
+
+                // only types that support numeric arithmetic (integer, decimal, float, date) can partition the scan
+                if (!(ArrowType.ArrowTypeID.Int.equals(partitionFieldTypeId)
+                    || ArrowType.ArrowTypeID.Decimal.equals(partitionFieldTypeId)
+                    || ArrowType.ArrowTypeID.FloatingPoint.equals(partitionFieldTypeId)
+                    || ArrowType.ArrowTypeID.Date.equals(partitionFieldTypeId))) {
+                    throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "partition field " + partitionFieldName + " does not support numeric computation");
+                }
+
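+                // Record the partition field's Arrow type; JdbcSplitReader later switches on it
+                // to advance range bounds with integer vs. floating-point arithmetic.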
+                resultFormatConfig.getPartitionBehavior().setType(partitionFieldTypeId);
+            } else if (StringUtils.isNotEmpty(resultFormatConfig.getPrimaryKey())) {
+                // no partition parameters were supplied, try to derive them:
+                // fall back to the primary key as the partition condition
+                ArrowType.ArrowTypeID primaryKeyTypeId = schema.findField(resultFormatConfig.getPrimaryKey()).getFieldType().getType().getTypeID();
+                if (ArrowType.ArrowTypeID.Int.equals(primaryKeyTypeId)
+                    || ArrowType.ArrowTypeID.Decimal.equals(primaryKeyTypeId)
+                    || ArrowType.ArrowTypeID.FloatingPoint.equals(primaryKeyTypeId)
+                    || ArrowType.ArrowTypeID.Date.equals(primaryKeyTypeId)) {
+                    resultFormatConfig.setPartitionBehavior(PartitionBehavior.builder()
+                        .fieldName(resultFormatConfig.getPrimaryKey())
+                        .type(primaryKeyTypeId)
+                        .build());
+                }
+            } else {
+                resultFormatConfig.setPartitionBehavior(null);
+            }
+
+            return InferSchemaResult.builder()
+                .schema(schema)
+                .datasetFormatConfig(DatasetFormatConfig.builder()
+                    .type(DatasetFormatTypeEnum.TABLE)
+                    .formatConfig(resultFormatConfig)
+                    .build())
+                .build();
+        } catch (SQLException e) {
+            throw DataproxyException.of(DataproxyErrorCode.JDBC_CALL_ERROR, "table schema inference failed", e);
+        }
+    }
+
+    @Override
+    public DataReader buildReader(BufferAllocator allocator, DatasetReadCommand readCommand) {
+        JdbcLocationConfig jdbcLocationConfig = (JdbcLocationConfig) readCommand.getLocationConfig().getLocationConfig();
+        this.jdbcAssistant.fillDefaultValue(connConfig, jdbcLocationConfig);
+
+        // default the column list to every field in the schema
+        if (CollectionUtils.isEmpty(readCommand.getFieldList())) {
+            readCommand.setFieldList(readCommand.getSchema().getFields().stream()
+                .map(Field::getName)
+                .collect(Collectors.toList())
+            );
+        }
+
+        return new JdbcDataReader(allocator,
+            this.jdbcAssistant,
+            this.dataSource,
+            (TableFormatConfig) readCommand.getFormatConfig().getFormatConfig(),
+            readCommand.getOutputFormatConfig(),
+            this.jdbcAssistant.composeTableName(jdbcLocationConfig),
+            readCommand.getSchema(),
+            readCommand.getFieldList(),
+            readCommand.getFilter());
+    }
+
+    @Override
+    public DataWriter buildWriter(DatasetWriteCommand writeCommand) {
+        JdbcLocationConfig jdbcLocationConfig = (JdbcLocationConfig) writeCommand.getLocationConfig().getLocationConfig();
+        this.jdbcAssistant.fillDefaultValue(connConfig, jdbcLocationConfig);
+
+        if (writeCommand.getFormatConfig().getFormatConfig() == null) {
+            writeCommand.getFormatConfig().setFormatConfig(TableFormatConfig.builder().build());
+        }
+        TableFormatConfig formatConfig = (TableFormatConfig) writeCommand.getFormatConfig().getFormatConfig();
+
+        // no primary key defined, generate one
+        if (StringUtils.isEmpty(formatConfig.getPrimaryKey())) {
+            String primaryKey = "pk_" + IdUtils.createRandString(6);
+            formatConfig.setPrimaryKey(primaryKey);
+        }
+        return new JdbcDataWriter(this.jdbcAssistant,
+            this.dataSource,
+            this.jdbcAssistant.composeTableName(jdbcLocationConfig), jdbcLocationConfig,
+            formatConfig,
+            writeCommand.getSchema());
+    }
+
+    @Override
+    public boolean isAvailable() {
+        try {
+            checkAdaptorStatus();
+            return true;
+        } catch (Exception e) {
+            return false;
+        }
+    }
+
+    /**
+     * Connectivity check.
+     */
+    public void checkAdaptorStatus() {
+        try (Connection conn = this.jdbcAssistant.getDatabaseConn(dataSource);
+             PreparedStatement preparedStatement = conn.prepareStatement(this.jdbcAssistant.getConnectionTestQuery())) {
+            preparedStatement.execute();
+        } catch (Exception e) {
+            throw DataproxyException.of(DataproxyErrorCode.JDBC_DATASOURCE_CONNECTION_VALIDATE_FAILED, e);
+        }
+    }
+
+    protected Schema getSchema(BufferAllocator allocator, Connection conn, String tableName) throws SQLException {
+        Map<String, String> valuesMap = new HashMap<>();
+        valuesMap.put("sqlPart", "* from " + tableName);
+        valuesMap.put("limitPart", String.format(this.jdbcAssistant.generateLimitConditionTemplate(false), 1));
+        String sampleSql = new StringSubstitutor(valuesMap).replace(this.jdbcAssistant.selectSQLTemplate());
+        try (ResultSet sampleRs = conn.createStatement().executeQuery(sampleSql)) {
+            // set a BigDecimal rounding mode so lossy-precision values still convert
+            JdbcToArrowConfig config = new JdbcToArrowConfigBuilder()
+                .setAllocator(allocator)
+                .setBigDecimalRoundingMode(RoundingMode.CEILING)
+                .build();
+            // schema inference
+            return JdbcToArrowUtils.jdbcToArrowSchema(sampleRs.getMetaData(), config);
+        }
+    }
+
+    /**
+     * Get the primary key column name.
+     *
+     * <p>Not required for hive databases.
+     *
+     * @param conn      database connection
+     * @param tableName table name
+     * @return primary key column name
+     * @throws SQLException on metadata access failure
+     */
+    protected String getPrimaryKeyColumnName(Connection conn, String tableName) throws SQLException {
+        String primaryKeyColumnName;
+        try (ResultSet primaryKeyResultSet = conn.getMetaData().getPrimaryKeys(
+            dataSource.getCatalog(), dataSource.getSchema(), tableName
+        )) {
+            if (!primaryKeyResultSet.next()) {
+                return null;
+            }
+            primaryKeyColumnName = primaryKeyResultSet.getString("COLUMN_NAME");
+            if (StringUtils.isEmpty(primaryKeyColumnName)) {
+                return null;
+            }
+        }
+        return primaryKeyColumnName;
+    }
+
+    /**
+     * Query the table indexes.
+     *
+     * @param conn      database connection
+     * @param tableName table name
+     * @return index list
+     * @throws SQLException on metadata access failure
+     */
+    protected List<TableIndex> getIndex(Connection conn, String tableName) throws SQLException {
+        Map<String, TableIndex> indexMap = new HashMap<>();
+
+        try (ResultSet indexResultSet = conn.getMetaData().getIndexInfo(
+            dataSource.getCatalog(), dataSource.getSchema(), tableName, false, false
+        )) {
+            while (indexResultSet.next()) {
+                String colName = indexResultSet.getString("COLUMN_NAME");
+                String indexName = indexResultSet.getString("INDEX_NAME");
+
+                if (!indexMap.containsKey(indexName)) {
+                    indexMap.put(indexName, TableIndex.builder()
+                        .indexName(indexName)
+                        .type(indexResultSet.getBoolean("NON_UNIQUE") ? IndexType.INDEX : IndexType.UNIQUE)
+                        .field(new ArrayList<>())
+                        .build());
+                }
+                TableIndex index = indexMap.get(indexName);
+                index.getField().add(colName);
+            }
+        }
+        return new ArrayList<>(indexMap.values());
+    }
+
+    @Override
+    public void close() throws Exception {
+        this.dataSource.close();
+    }
+}
\ No newline at end of file
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcDataReader.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcDataReader.java
new file mode 100644
index 0000000..cd8c70d
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcDataReader.java
@@ -0,0 +1,208 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms;
+
+import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode;
+import org.secretflow.dataproxy.common.exceptions.DataproxyException;
+import org.secretflow.dataproxy.common.model.FlightContentFormatConfig;
+import org.secretflow.dataproxy.common.model.dataset.format.PartitionBehavior;
+import org.secretflow.dataproxy.common.model.dataset.format.TableFormatConfig;
+import org.secretflow.dataproxy.manager.DataReader;
+import org.secretflow.dataproxy.manager.SplitReader;
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.JdbcToArrowConfig;
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.JdbcToArrowConfigBuilder;
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.JdbcToArrowUtils;
+
+import com.zaxxer.hikari.HikariDataSource;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.ValueVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.arrow.vector.util.ValueVectorUtility;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.text.StringSubstitutor;
+
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.math.MathContext;
+import java.math.RoundingMode;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * @author muhong
+ * @date 2023-09-07 17:57
+ */
+public class JdbcDataReader implements DataReader {
+
+    protected static final int PARTITION_STAT_SAMPLE_ROW_COUNT = 100;
+    private static final long MEMORY_LIMIT = 100 * 1024 * 1024;
+    /**
+     * database connection pool
+     */
+    private final HikariDataSource dataSource;
+    private final JdbcAssistant jdbcAssistant;
+    private final BufferAllocator allocator;
+    private final TableFormatConfig sourceSchemaConfig;
+    private final FlightContentFormatConfig outputFormatConfig;
+    private final List<String> fieldList;
+    private final String filter;
+    /**
+     * composed table name of the source
+     */
+    private final String composeTableName;
+    private final Schema schema;
+
+
+    public JdbcDataReader(BufferAllocator allocator,
+                          JdbcAssistant jdbcAssistant,
+                          HikariDataSource dataSource,
+                          TableFormatConfig sourceSchemaConfig,
+                          FlightContentFormatConfig outputFormatConfig,
+                          String composeTableName,
+                          Schema schema,
+                          List<String> fieldList,
+                          String filter) {
+
+        this.allocator = allocator;
+        this.jdbcAssistant = jdbcAssistant;
+        this.dataSource = dataSource;
+        this.schema = schema;
+        this.sourceSchemaConfig = sourceSchemaConfig;
+        this.fieldList = fieldList;
+        this.filter = filter;
+        this.outputFormatConfig = outputFormatConfig;
+        this.composeTableName = composeTableName;
+    }
+
+    @Override
+    public List<SplitReader> createSplitReader(int splitNumber) {
+        fillPartitionBehavior();
+
+        // todo: split by splitNumber and partitionBehavior
+        return Arrays.asList(new JdbcSplitReader(allocator, jdbcAssistant, outputFormatConfig, dataSource, composeTableName, schema, this.sourceSchemaConfig.getPartitionBehavior(), fieldList, filter));
+    }
+
+    private void fillPartitionBehavior() {
+        try {
+            PartitionBehavior partitionBehavior = sourceSchemaConfig.getPartitionBehavior();
+            if (partitionBehavior == null || StringUtils.isEmpty(partitionBehavior.getFieldName())) {
+                return;
+            }
+
+            String partitionField = this.jdbcAssistant.decorateIdentifier(partitionBehavior.getFieldName());
+
+            try (Connection conn = this.jdbcAssistant.getDatabaseConn(dataSource)) {
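+                // Overall strategy (summarizing the code below): probe max/min of the partition
+                // field, count the rows, sample up to PARTITION_STAT_SAMPLE_ROW_COUNT rows to
+                // estimate bytes per row, then size the step so one split stays within MEMORY_LIMIT:
+                //   step = (upperBound - lowerBound) / ceil(bytesPerRow * rowCount / MEMORY_LIMIT)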
+
+                // determine the upper bound
+                if (StringUtils.isEmpty(partitionBehavior.getUpperBound())) {
+                    Map<String, String> valuesMap = new HashMap<>();
+                    valuesMap.put("sqlPart", "max(" + partitionField + ") from " + composeTableName);
+                    valuesMap.put("limitPart", "");
+                    String maxPkSql = new StringSubstitutor(valuesMap).replace(this.jdbcAssistant.selectSQLTemplate());
+                    try (ResultSet maxPkRs = conn.createStatement().executeQuery(maxPkSql)) {
+                        if (!maxPkRs.next()) {
+                            return;
+                        }
+                        String maxStr = maxPkRs.getObject(1).toString();
+                        partitionBehavior.setUpperBound(maxStr);
+                    }
+                }
+
+                // determine the lower bound
+                if (StringUtils.isEmpty(partitionBehavior.getLowerBound())) {
+                    Map<String, String> valuesMap = new HashMap<>();
+                    valuesMap.put("sqlPart", "min(" + partitionField + ") from " + composeTableName);
+                    valuesMap.put("limitPart", "");
+                    String minPkSql = new StringSubstitutor(valuesMap).replace(this.jdbcAssistant.selectSQLTemplate());
+                    try (ResultSet minPkRs = conn.createStatement().executeQuery(minPkSql)) {
+                        if (!minPkRs.next()) {
+                            return;
+                        }
+                        String minStr = minPkRs.getObject(1).toString();
+                        partitionBehavior.setLowerBound(minStr);
+                    }
+                }
+
+                // estimate the step size
+                if (StringUtils.isEmpty(partitionBehavior.getStep())) {
+                    Map<String, String> valuesMap = new HashMap<>();
+
+                    // total row count
+                    long count = 0;
+                    valuesMap.put("sqlPart", "count(*) from " + composeTableName);
+                    valuesMap.put("limitPart", "");
+                    String countSql = new StringSubstitutor(valuesMap).replace(this.jdbcAssistant.selectSQLTemplate());
+                    try (ResultSet countRs = conn.createStatement().executeQuery(countSql)) {
+                        countRs.next();
+                        count = countRs.getLong(1);
+                    }
+                    // the dataset is empty, nothing to estimate
+                    if (count == 0) {
+                        return;
+                    }
+
+                    valuesMap.put("sqlPart", this.jdbcAssistant.createFieldPart(this.fieldList, this.composeTableName));
+                    valuesMap.put("limitPart", String.format(
+                        this.jdbcAssistant.generateLimitConditionTemplate(false), PARTITION_STAT_SAMPLE_ROW_COUNT
+                    ));
+                    String sampleSql = new StringSubstitutor(valuesMap).replace(this.jdbcAssistant.selectSQLTemplate());
+                    try (ResultSet sampleRs = conn.createStatement().executeQuery(sampleSql)) {
+                        // convert the sampled rows to arrow
+                        int sampleRowCount = 0;
+                        long sampleDataSize = 0;
+
+                        // set a BigDecimal rounding mode so lossy-precision values still convert
+                        JdbcToArrowConfig config = new JdbcToArrowConfigBuilder()
+                            .setAllocator(allocator)
+                            .setTargetBatchSize(PARTITION_STAT_SAMPLE_ROW_COUNT)
+                            .setBigDecimalRoundingMode(RoundingMode.CEILING)
+                            .build();
+
+                        try (VectorSchemaRoot root = VectorSchemaRoot.create(
+                            JdbcToArrowUtils.jdbcToArrowSchema(sampleRs.getMetaData(), config), config.getAllocator())) {
+                            if (config.getTargetBatchSize() != JdbcToArrowConfig.NO_LIMIT_BATCH_SIZE) {
+                                ValueVectorUtility.preAllocate(root, config.getTargetBatchSize());
+                            }
+                            JdbcToArrowUtils.jdbcToArrowVectors(sampleRs, root, config);
+                            // total size is the sum over all columns
+                            sampleRowCount = root.getRowCount();
+                            sampleDataSize = root.getFieldVectors().stream().map(ValueVector::getBufferSize).reduce(0, Integer::sum);
+                        }
+
+                        // step = (max - min) / ((sampleSize / sampleRowCount) * totalRows / memoryLimit)
+                        BigDecimal dataInterval = new BigDecimal(partitionBehavior.getUpperBound()).subtract(new BigDecimal(partitionBehavior.getLowerBound()));
+                        if (dataInterval.compareTo(BigDecimal.ZERO) == 0) {
+                            partitionBehavior.setStep("1");
+                        } else {
+                            int partitionSize = (int) Math.ceil((double) sampleDataSize / sampleRowCount * count / MEMORY_LIMIT);
+                            BigDecimal step = dataInterval.divide(new BigDecimal(partitionSize), MathContext.DECIMAL32);
+                            partitionBehavior.setStep(step.toString());
+                        }
+                    }
+                }
+            }
+        } catch (SQLException | IOException e) {
+            throw DataproxyException.of(DataproxyErrorCode.JDBC_GET_PARTITION_STATS_FAILED, e);
+        }
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcDataWriter.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcDataWriter.java
new file mode 100644
index 0000000..cde59d9
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcDataWriter.java
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms;
+
+import com.zaxxer.hikari.HikariDataSource;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode;
+import org.secretflow.dataproxy.common.exceptions.DataproxyException;
+import org.secretflow.dataproxy.common.model.dataset.format.TableFormatConfig;
+import org.secretflow.dataproxy.common.model.datasource.location.JdbcLocationConfig;
+import org.secretflow.dataproxy.common.utils.JsonUtils;
+import org.secretflow.dataproxy.manager.DataWriter;
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.JdbcParameterBinder;
+
+import java.io.IOException;
+import java.sql.*;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * jdbc datasource writer
+ *
+ * @author muhong
+ * @date 2023-09-08 15:37
+ */
+@Slf4j
+public class JdbcDataWriter implements DataWriter {
+
+    /**
+     * database connection pool
+     */
+    protected HikariDataSource dataSource;
+
+    /**
+     * composed table name
+     */
+    protected String composeTableName;
+
+    protected JdbcAssistant jdbcAssistant;
+
+    protected JdbcLocationConfig locationConfig;
+
+    protected TableFormatConfig formatConfig;
+
+    // the statement template, either a "merge into" or an "insert into" sql statement
+    protected String stmt;
+
+    private boolean initialized;
+
+    public JdbcDataWriter() {
+    }
+
+    public JdbcDataWriter(JdbcAssistant jdbcAssistant, HikariDataSource dataSource, String composeTableName, JdbcLocationConfig locationConfig, TableFormatConfig formatConfig, Schema schema) {
+        this.jdbcAssistant = jdbcAssistant;
+        this.dataSource = dataSource;
+        this.initialized = false;
+        this.formatConfig = formatConfig;
+        this.locationConfig = locationConfig;
+
+        this.composeTableName = composeTableName;
+
+        ensureInitialized(schema);
+    }
+
+    protected void ensureInitialized(Schema schema) {
+        if (!this.initialized) {
+            this.initialize(schema);
+            this.initialized = true;
+        }
+    }
+
+    protected void initialize(Schema schema) {
+        List<String> preSqlList = this.jdbcAssistant.preWorkSqls(this.composeTableName, schema, this.locationConfig, this.formatConfig);
+        log.info("[JdbcDataWriter] preSql execute start, sql: {}", JsonUtils.toJSONString(preSqlList));
+
+        try (Connection conn = this.jdbcAssistant.getDatabaseConn(dataSource)) {
+            executePreWorkSqls(conn, preSqlList);
+        } catch (SQLException e) {
+            throw DataproxyException.of(DataproxyErrorCode.JDBC_CREATE_TABLE_FAILED, e.getMessage(), e);
+        }
+
+        // build the parameterized insert template
+        this.stmt = String.format("insert into %s(%s) values(%s)", composeTableName,
+            String.join(",", schema.getFields().stream().map(field -> this.jdbcAssistant.decorateIdentifier(field.getName())).toArray(String[]::new)),
+            String.join(",", schema.getFields().stream().map(field -> "?").toArray(String[]::new)));
+    }
+
+    @Override
+    public void write(VectorSchemaRoot root) throws IOException {
+        ensureInitialized(root.getSchema());
+
+        // flush every batch immediately instead of accumulating
+        final int rowCount = root.getRowCount();
+        int recordCount = 0;
+
+        try (Connection conn = this.jdbcAssistant.getDatabaseConn(dataSource)) {
+            boolean finished = false;
+
+            if (this.jdbcAssistant.supportBatchInsert()) {
+                try (PreparedStatement preparedStatement = conn.prepareStatement(this.stmt)) {
+                    if (rowCount != 0) {
+                        final JdbcParameterBinder binder = JdbcParameterBinder.builder(preparedStatement, root).bindAll().build();
+                        while (binder.next()) {
+                            preparedStatement.addBatch();
+                        }
+                        int[] recordCounts = preparedStatement.executeBatch();
+                        recordCount = Arrays.stream(recordCounts).sum();
+                    }
+                    finished = true;
+                } catch (Exception e) {
+                    log.warn("[JdbcDataWriter] prepare batch write error, then dp will try to generate integral insert sql, stmt:{}", this.stmt, e);
+                }
+            }
+
+            // prepared mode unsupported, build complete insert statements instead, e.g.
+            // insert into `default`.`test_table`(`int32`,`float64`,`string`) values(?,?,?)
+            if (!finished) {
+                String insertSql = null;
+                List<JDBCType> jdbcTypes = root.getFieldVectors().stream()
+                    .map(vector -> this.jdbcAssistant.arrowTypeToJdbcType(vector.getField()))
+                    .toList();
+
+                try (Statement statement = conn.createStatement()) {
+                    // write row by row
+                    for (int row = 0; row < root.getRowCount(); row++) {
+                        String[] values = new String[root.getFieldVectors().size()];
+                        for (int col = 0; col < root.getFieldVectors().size(); col++) {
+                            values[col] = this.jdbcAssistant.serialize(jdbcTypes.get(col), root.getVector(col).getObject(row));
+                        }
+
+                        insertSql = String.format(this.stmt.replace("?", "%s"), (Object[]) values);
+                        statement.execute(insertSql);
+                    }
+                } catch (Exception e) {
+                    log.error("[JdbcDataWriter] integral insert sql error, sql:{}", insertSql, e);
+                    throw e;
+                }
+            }
+
+            log.info("[JdbcDataWriter] jdbc batch write success, record count:{}, table:{}", recordCount, this.composeTableName);
+        } catch (Exception e) {
+            log.error("[JdbcDataWriter] jdbc batch write failed, table:{}", this.composeTableName);
+            throw DataproxyException.of(DataproxyErrorCode.JDBC_INSERT_INTO_TABLE_FAILED, e);
+        }
+    }
+
+    @Override
+    public void flush() throws IOException {
+
+    }
+
+    @Override
+    public void destroy() throws IOException {
+
+    }
+
+    @Override
+    public void close() throws Exception {
+        try {
+            if (this.dataSource != null) {
+                this.dataSource.close();
+            }
+        } catch (Exception ignored) {
+        }
+    }
+
+    void executePreWorkSqls(Connection conn, List<String> preWorkSqls) throws SQLException {
+        for (String sql : preWorkSqls) {
+            try (Statement statement = conn.createStatement()) {
+                statement.execute(sql);
+            } catch (SQLException e) {
+                log.error("[SinkJdbcHandler] failed to execute pre-work SQL before data transfer: {}", sql);
+                throw e;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcSplitReader.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcSplitReader.java
new file mode 100644
index 0000000..c63e4fe
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/JdbcSplitReader.java
@@ -0,0 +1,256 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms;
+
+import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode;
+import org.secretflow.dataproxy.common.exceptions.DataproxyException;
+import org.secretflow.dataproxy.common.model.FlightContentFormatConfig;
+import org.secretflow.dataproxy.common.model.dataset.format.PartitionBehavior;
+import org.secretflow.dataproxy.common.utils.JsonUtils;
+import org.secretflow.dataproxy.manager.SplitReader;
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.JdbcToArrowConfig;
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.JdbcToArrowConfigBuilder;
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.JdbcToArrowUtils;
+
+import com.zaxxer.hikari.HikariDataSource;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.BaseFixedWidthVector;
+import org.apache.arrow.vector.ValueVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.ipc.ArrowReader;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.commons.collections4.CollectionUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.text.StringSubstitutor;
+
+import java.io.IOException;
+import java.math.RoundingMode;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * @author muhong
+ * @date 2023-09-07 19:47
+ */
+@Slf4j
+public class JdbcSplitReader extends ArrowReader implements SplitReader {
+
+    private final PartitionBehavior partitionBehavior;
+    /**
+     * database connection pool
+     */
+    private HikariDataSource dataSource;
+    /**
+     * composed table name
+     */
+    private String composeTableName;
+    private JdbcAssistant jdbcAssistant;
+    private FlightContentFormatConfig outputFormatConfig;
+
+    private List<String> fieldList;
+
+    private String filter;
+
+    private Schema schema;
+
+    private String currentPartition = null;
+
+    private int currentSize = 0;
+
+    private String sqlPartTemplate;
+    private JdbcToArrowConfig config;
+
+    public JdbcSplitReader(BufferAllocator allocator,
+                           JdbcAssistant jdbcAssistant,
+                           FlightContentFormatConfig outputFormatConfig,
+                           HikariDataSource dataSource,
+                           String composeTableName,
+                           Schema schema,
+                           PartitionBehavior partitionBehavior,
+                           List<String> fieldList,
+                           String filter) {
+        super(allocator);
+        this.jdbcAssistant = jdbcAssistant;
+        this.dataSource = dataSource;
+        this.composeTableName = composeTableName;
+        this.partitionBehavior = partitionBehavior;
+        this.fieldList = fieldList;
+        this.filter = filter;
+        this.outputFormatConfig = outputFormatConfig;
+        this.schema = schema;
+
+        this.config = new JdbcToArrowConfigBuilder()
+            .setAllocator(allocator)
+            .setTargetBatchSize(JdbcToArrowConfig.NO_LIMIT_BATCH_SIZE)
+            // set a BigDecimal rounding mode so lossy-precision values still convert
+            .setBigDecimalRoundingMode(RoundingMode.CEILING)
+            .build();
+
+        // when a read order is specified, reorder the schema accordingly
+        if (CollectionUtils.isNotEmpty(fieldList)) {
+            this.schema = new Schema(fieldList.stream().map(schema::findField).collect(Collectors.toList()));
+        } else {
+            this.schema = schema;
+        }
+    }
+
+    private static void preAllocate(VectorSchemaRoot root, int targetSize) {
+        for (ValueVector vector : root.getFieldVectors()) {
+            if (vector instanceof BaseFixedWidthVector) {
+                ((BaseFixedWidthVector) vector).allocateNew(targetSize);
+            }
+        }
+    }
+
+    @Override
+    public ArrowReader startRead() {
+
+        String fieldPart = this.jdbcAssistant.createFieldPart(this.fieldList, this.composeTableName);
+
+        StringBuilder selectSqlBuilder = new StringBuilder().append(fieldPart);
+
+        // parameters substituted into the sql template
+        if (this.partitionBehavior != null && this.partitionBehavior.isValid()) {
+            log.info("[startRead] partitionBehavior: {}", JsonUtils.toJSONString(this.partitionBehavior));
+            String partitionFieldName = this.jdbcAssistant.decorateIdentifier(this.partitionBehavior.getFieldName());
+            selectSqlBuilder.append("where ")
+                .append(partitionFieldName)
+                .append(">= %s and ")
+                .append(partitionFieldName)
+                .append("< %s");
+        }
+        this.sqlPartTemplate = selectSqlBuilder.toString();
+        return this;
+    }
+
+    @Override
+    public boolean loadNextBatch() throws IOException {
+        VectorSchemaRoot root = getVectorSchemaRoot();
+        root.clear();
+
+        String sql = generateNextSql();
+        if (StringUtils.isEmpty(sql)) {
+            return false;
+        }
+
+        try (Connection conn = getDatabaseConn();
+             ResultSet resultSet = conn.createStatement().executeQuery(sql)) {
+            JdbcToArrowUtils.jdbcToArrowVectors(resultSet, getVectorSchemaRoot(), this.config);
+        } catch (SQLException e) {
+            throw DataproxyException.of(DataproxyErrorCode.JDBC_FETCH_BATCH_DATA_FAILED, e);
+        }
+        return true;
+    }
+
+    @Override
+    public long bytesRead() {
+        return 0;
+    }
+
+    @Override
+    protected void closeReadSource() throws IOException {
+        close();
+    }
+
+    @Override
+    protected Schema readSchema() {
+        return this.schema;
+    }
+
+    @Override
+    public void close() {
+        try {
+            if (this.dataSource != null) {
+                this.dataSource.close();
+            }
+        } catch (Exception ignored) {
+        }
+    }
+
+    /**
+     * Get a database connection.
+     *
+     * @return connection
+     */
+    protected Connection getDatabaseConn() {
+        try {
+            return dataSource.getConnection();
+        } catch (Exception e) {
+            throw DataproxyException.of(DataproxyErrorCode.JDBC_GET_CONN_THREAD_FAILED, e);
+        }
+    }
+
+    // generate the next query sql
+    private String generateNextSql() {
+        String currentLowerBoundStr = null;
+        String currentUpperBoundStr = null;
+
+        // partition advance rules
+        if (this.partitionBehavior != null && this.partitionBehavior.isValid()) {
+            String current = StringUtils.isEmpty(this.currentPartition) ?
+                this.partitionBehavior.getLowerBound() : this.currentPartition;
+
+            switch (this.partitionBehavior.getType()) {
+                case Int: {
+                    Long currentLowerBound = Long.valueOf(current);
+                    if (currentLowerBound > Long.valueOf(this.partitionBehavior.getUpperBound())) {
+                        return null;
+                    }
+
+                    Long currentUpperBound = currentLowerBound + (long) Math.ceil(Double.parseDouble(this.partitionBehavior.getStep()));
+
+                    currentLowerBoundStr = String.valueOf(currentLowerBound);
+                    currentUpperBoundStr = String.valueOf(currentUpperBound);
+                    break;
+                }
+                case FloatingPoint:
+                case Decimal: {
+                    Double currentLowerBound = Double.valueOf(current);
+                    if (currentLowerBound > Double.valueOf(this.partitionBehavior.getUpperBound())) {
+                        return null;
+                    }
+
+                    Double currentUpperBound = currentLowerBound + Double.parseDouble(this.partitionBehavior.getStep());
+
+                    currentLowerBoundStr = String.valueOf(currentLowerBound);
+                    currentUpperBoundStr = String.valueOf(currentUpperBound);
+                    break;
+                }
+                default:
+                    throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "partition field " + this.partitionBehavior.getFieldName() + " does not support numeric computation");
+            }
+        } else {
+            if (this.currentSize > 0) {
+                return null;
+            }
+        }
+
+        Map<String, String> valuesMap = new HashMap<>();
+        valuesMap.put("sqlPart", String.format(this.sqlPartTemplate, currentLowerBoundStr, currentUpperBoundStr));
+        valuesMap.put("limitPart", "");
+        String execSql = new StringSubstitutor(valuesMap).replace(this.jdbcAssistant.selectSQLTemplate());
+
+        this.currentPartition = currentUpperBoundStr;
+        this.currentSize++;
+        return execSql;
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/MysqlJdbcAssistant.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/MysqlJdbcAssistant.java
new file mode 100644
index 0000000..bb0b8d6
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/MysqlJdbcAssistant.java
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms;
+
+import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode;
+import org.secretflow.dataproxy.common.exceptions.DataproxyException;
+import org.secretflow.dataproxy.common.model.dataset.format.IndexType;
+import org.secretflow.dataproxy.common.model.dataset.format.TableFormatConfig;
+import org.secretflow.dataproxy.common.model.datasource.conn.MysqlConnConfig;
+import org.secretflow.dataproxy.common.model.datasource.location.MysqlLocationConfig;
+
+import com.zaxxer.hikari.HikariConfig;
+import org.apache.arrow.vector.types.pojo.Schema;
+
+import java.sql.JDBCType;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * @author muhong
+ * @date 2023-09-07 19:56
+ */
+public class MysqlJdbcAssistant implements JdbcAssistant<MysqlConnConfig, MysqlLocationConfig> {
+    private static final String MYSQL_JDBC_URL_PREFIX = "jdbc:mysql://";
+    private static final String MYSQL_CONNECTION_TEST_QUERY = "SELECT 1 FROM DUAL";
+    private static final String MYSQL_DRIVER_CLASS_NAME = "com.mysql.cj.jdbc.Driver";
+
+    @Override
+    public String getConnectionTestQuery() {
+        return MYSQL_CONNECTION_TEST_QUERY;
+    }
+
+    @Override
+    public String getDriverClass() {
+        return MYSQL_DRIVER_CLASS_NAME;
+    }
+
+    @Override
+    public void initDataSourceConfig(HikariConfig config, MysqlConnConfig connConfig) {
+        config.setJdbcUrl(MYSQL_JDBC_URL_PREFIX + connConfig.getHost());
+        config.setCatalog(connConfig.getDatabase());
+    }
+
+    @Override
+    public String decorateIdentifier(String identifier) {
+        return "`" + identifier + "`";
+    }
+
+    @Override
+    public String composeTableName(MysqlLocationConfig locationConfig) {
+        return decorateIdentifier(locationConfig.getTable());
+    }
+
+    @Override
+    public String jdbcTypeToDbTypeString(JDBCType jdbcType) {
+        return switch (jdbcType) {
+            case TINYINT -> "TINYINT";
+            case SMALLINT -> "SMALLINT";
+            case INTEGER -> "INT";
+            case BIGINT -> "BIGINT";
+            case REAL -> "REAL";
+            case FLOAT -> "FLOAT";
+            case DOUBLE -> "DOUBLE";
+            case DECIMAL -> "DECIMAL";
+            case BOOLEAN -> "BOOLEAN";
+            case DATE -> "DATE";
+            case TIME -> "TIME";
+            case TIMESTAMP, TIMESTAMP_WITH_TIMEZONE -> "TIMESTAMP DEFAULT '2000-01-01 00:00:00'";
+            case VARCHAR -> "TEXT";
+            case LONGVARCHAR -> "LONGTEXT";
+            case BINARY, VARBINARY -> "BLOB";
+            case LONGVARBINARY -> "LONGBLOB";
+            default -> throw DataproxyException.of(DataproxyErrorCode.UNSUPPORTED_FIELD_TYPE, jdbcType.name());
+        };
+    }
+
+    @Override
+    public String indexKeyword(IndexType indexType) {
+        switch (indexType) {
+            case UNIQUE:
+                return "UNIQUE KEY";
+            case INDEX:
+                return "INDEX";
+            default:
+                throw DataproxyException.of(DataproxyErrorCode.UNSUPPORTED_INDEX_TYPE, indexType.name());
+        }
+    }
+
+    @Override
+    public List<String> preWorkSqls(String composeTableName, Schema schema, MysqlLocationConfig locationConfig, TableFormatConfig formatConfig) {
+        List<String> preWorkSqls = new ArrayList<>();
+        preWorkSqls.add(dropTableSql(composeTableName));
+        String createTableSql = "CREATE TABLE " + composeTableName + " ("
+            + createTableColumnTypes(schema.getFields(), formatConfig)
+            + "," + decorateIdentifier(formatConfig.getPrimaryKey())
+            + " BIGINT PRIMARY KEY NOT NULL AUTO_INCREMENT"
+            + createIndex(formatConfig.getIndexList(), schema.getFields())
+            + ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4";
+        preWorkSqls.add(createTableSql);
+        return preWorkSqls;
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/ArrowVectorIterator.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/ArrowVectorIterator.java
new file mode 100644
index 0000000..404a89a
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/ArrowVectorIterator.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor;
+
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer.CompositeJdbcConsumer;
+import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer.JdbcConsumer;
+
+import org.apache.arrow.util.AutoCloseables;
+import org.apache.arrow.util.Preconditions;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.arrow.vector.util.ValueVectorUtility;
+
+import java.sql.ResultSet;
+import java.sql.ResultSetMetaData;
+import java.sql.SQLException;
+import java.util.Iterator;
+
+
+/**
+ * VectorSchemaRoot iterator for partially converting JDBC data.
+ */
+public class ArrowVectorIterator implements Iterator<VectorSchemaRoot>, AutoCloseable {
+
+    final CompositeJdbcConsumer compositeConsumer;
+    private final ResultSet resultSet;
+    private final JdbcToArrowConfig config;
+    private final Schema schema;
+    private final ResultSetMetaData rsmd;
+    private final JdbcConsumer[] consumers;
+    private final int targetBatchSize;
+    // this is used only if reusing the vector schema root is enabled.
+    private VectorSchemaRoot nextBatch;
+    // This is used to track whether the ResultSet has been fully read, and is needed specifically for cases where
+    // there is a ResultSet having zero rows (empty):
+    private boolean readComplete = false;
+
+    /**
+     * Construct an instance.
+     */
+    private ArrowVectorIterator(ResultSet resultSet, JdbcToArrowConfig config) throws SQLException {
+        this.resultSet = resultSet;
+        this.config = config;
+        this.schema = JdbcToArrowUtils.jdbcToArrowSchema(resultSet.getMetaData(), config);
+        this.targetBatchSize = config.getTargetBatchSize();
+
+        rsmd = resultSet.getMetaData();
+        consumers = new JdbcConsumer[rsmd.getColumnCount()];
+        this.compositeConsumer = new CompositeJdbcConsumer(consumers);
+        this.nextBatch = config.isReuseVectorSchemaRoot() ? createVectorSchemaRoot() : null;
+    }
+
+    /**
+     * Create an ArrowVectorIterator to partially convert data.
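+     *
+     * <p>Illustrative usage sketch, assuming {@code config.isReuseVectorSchemaRoot()} is false
+     * so each returned root is owned and closed by the caller:
+     * <pre>{@code
+     * try (ArrowVectorIterator it = ArrowVectorIterator.create(resultSet, config)) {
+     *     while (it.hasNext()) {
+     *         try (VectorSchemaRoot root = it.next()) {
+     *             // consume root here
+     *         }
+     *     }
+     * }
+     * }</pre>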
+ */ + public static ArrowVectorIterator create( + ResultSet resultSet, + JdbcToArrowConfig config) + throws SQLException { + ArrowVectorIterator iterator = null; + try { + iterator = new ArrowVectorIterator(resultSet, config); + } catch (Throwable e) { + AutoCloseables.close(e, iterator); + throw new RuntimeException("Error occurred while creating iterator.", e); + } + return iterator; + } + + private void consumeData(VectorSchemaRoot root) { + // consume data + try { + int readRowCount = 0; + if (targetBatchSize == JdbcToArrowConfig.NO_LIMIT_BATCH_SIZE) { + while (resultSet.next()) { + ValueVectorUtility.ensureCapacity(root, readRowCount + 1); + compositeConsumer.consume(resultSet); + readRowCount++; + } + readComplete = true; + } else { + while ((readRowCount < targetBatchSize) && !readComplete) { + if (resultSet.next()) { + compositeConsumer.consume(resultSet); + readRowCount++; + } else { + readComplete = true; + } + } + } + + root.setRowCount(readRowCount); + } catch (Throwable e) { + compositeConsumer.close(); + throw new RuntimeException("Error occurred while consuming data.", e); + } + } + + private VectorSchemaRoot createVectorSchemaRoot() throws SQLException { + VectorSchemaRoot root = null; + try { + root = VectorSchemaRoot.create(schema, config.getAllocator()); + if (config.getTargetBatchSize() != JdbcToArrowConfig.NO_LIMIT_BATCH_SIZE) { + ValueVectorUtility.preAllocate(root, config.getTargetBatchSize()); + } + } catch (Throwable e) { + if (root != null) { + root.close(); + } + throw new RuntimeException("Error occurred while creating schema root.", e); + } + initialize(root); + return root; + } + + private void initialize(VectorSchemaRoot root) throws SQLException { + for (int i = 1; i <= consumers.length; i++) { + final JdbcFieldInfo columnFieldInfo = JdbcToArrowUtils.getJdbcFieldInfoForColumn(rsmd, i, config); + ArrowType arrowType = config.getJdbcToArrowTypeConverter().apply(columnFieldInfo); + consumers[i - 1] = JdbcToArrowUtils.getConsumer( + arrowType, i, JdbcToArrowUtils.isColumnNullable(resultSet.getMetaData(), i, columnFieldInfo), root.getVector(i - 1), config); + } + } + + // Loads the next schema root or null if no more rows are available. + private void load(VectorSchemaRoot root) { + for (int i = 0; i < consumers.length; i++) { + FieldVector vec = root.getVector(i); + if (config.isReuseVectorSchemaRoot()) { + // if we are reusing the vector schema root, + // we must reset the vector before populating it with data. + vec.reset(); + } + consumers[i].resetValueVector(vec); + } + + consumeData(root); + } + + @Override + public boolean hasNext() { + return !readComplete; + } + + /** + * Gets the next vector. + * If {@link JdbcToArrowConfig#isReuseVectorSchemaRoot()} is false, + * the client is responsible for freeing its resources. + */ + @Override + public VectorSchemaRoot next() { + Preconditions.checkArgument(hasNext()); + try { + VectorSchemaRoot ret = config.isReuseVectorSchemaRoot() ? nextBatch : createVectorSchemaRoot(); + load(ret); + return ret; + } catch (Exception e) { + close(); + throw new RuntimeException("Error occurred while getting next schema root.", e); + } + } + + /** + * Clean up resources ONLY WHEN THE {@link VectorSchemaRoot} HOLDING EACH BATCH IS REUSED. If a new VectorSchemaRoot + * is created for each batch, each root must be closed manually by the client code. 
+ */ + @Override + public void close() { + if (config.isReuseVectorSchemaRoot()) { + nextBatch.close(); + compositeConsumer.close(); + } + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/Constants.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/Constants.java new file mode 100644 index 0000000..6c58064 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/Constants.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor; + +/** + * String constants used for metadata returned on Vectors. + */ +public class Constants { + public static final String SQL_CATALOG_NAME_KEY = "SQL_CATALOG_NAME"; + public static final String SQL_SCHEMA_NAME_KEY = "SQL_SCHEMA_NAME"; + public static final String SQL_TABLE_NAME_KEY = "SQL_TABLE_NAME"; + public static final String SQL_COLUMN_NAME_KEY = "SQL_COLUMN_NAME"; + public static final String SQL_TYPE_KEY = "SQL_TYPE"; + private Constants() { + } + +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcFieldInfo.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcFieldInfo.java new file mode 100644 index 0000000..1ade6dd --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcFieldInfo.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor; + +import org.apache.arrow.util.Preconditions; + +import java.sql.ResultSetMetaData; +import java.sql.SQLException; +import java.sql.Types; + +/** + * This class represents the information about a JDBC ResultSet Field that is + * needed to construct an {@link org.apache.arrow.vector.types.pojo.ArrowType}. + * Currently, this is: + *
+ * <ul>
+ *   <li>The JDBC {@link Types} type.</li>
+ *   <li>The nullability.</li>
+ *   <li>The field's precision (used for {@link Types#DECIMAL} and {@link Types#NUMERIC} types).</li>
+ *   <li>The field's scale (used for {@link Types#DECIMAL} and {@link Types#NUMERIC} types).</li>
+ * </ul>
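+ *
+ * <p>For example, a DECIMAL(10, 2) column can be described as (values are illustrative):
+ * <pre>{@code
+ * JdbcFieldInfo decimalInfo = new JdbcFieldInfo(java.sql.Types.DECIMAL, 10, 2);
+ * }</pre>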
+ */ +public class JdbcFieldInfo { + private final int column; + private final int jdbcType; + private final int nullability; + private final int precision; + private final int scale; + + /** + * Builds a JdbcFieldInfo using only the {@link Types} type. Do not use this constructor + * if the field type is {@link Types#DECIMAL} or {@link Types#NUMERIC}; the precision and + * scale will be set to 0. + * + * @param jdbcType The {@link Types} type. + * @throws IllegalArgumentException if jdbcType is {@link Types#DECIMAL} or {@link Types#NUMERIC}. + */ + public JdbcFieldInfo(int jdbcType) { + Preconditions.checkArgument( + (jdbcType != Types.DECIMAL && jdbcType != Types.NUMERIC), + "DECIMAL and NUMERIC types require a precision and scale; please use another constructor."); + + this.column = 0; + this.jdbcType = jdbcType; + this.nullability = ResultSetMetaData.columnNullableUnknown; + this.precision = 0; + this.scale = 0; + } + + /** + * Builds a JdbcFieldInfo from the {@link Types} type, precision, and scale. + * Use this constructor for {@link Types#DECIMAL} and {@link Types#NUMERIC} types. + * + * @param jdbcType The {@link Types} type. + * @param precision The field's numeric precision. + * @param scale The field's numeric scale. + */ + public JdbcFieldInfo(int jdbcType, int precision, int scale) { + this.column = 0; + this.jdbcType = jdbcType; + this.nullability = ResultSetMetaData.columnNullableUnknown; + this.precision = precision; + this.scale = scale; + } + + /** + * Builds a JdbcFieldInfo from the {@link Types} type, nullability, precision, and scale. + * + * @param jdbcType The {@link Types} type. + * @param nullability The nullability. Must be one of {@link ResultSetMetaData#columnNoNulls}, + * {@link ResultSetMetaData#columnNullable}, or {@link ResultSetMetaData#columnNullableUnknown}. + * @param precision The field's numeric precision. + * @param scale The field's numeric scale. + */ + public JdbcFieldInfo(int jdbcType, int nullability, int precision, int scale) { + this.column = 0; + this.jdbcType = jdbcType; + this.nullability = nullability; + this.precision = precision; + this.scale = scale; + } + + /** + * Builds a JdbcFieldInfo from the corresponding {@link ResultSetMetaData} column. + * + * @param rsmd The {@link ResultSetMetaData} to get the field information from. + * @param column The column to get the field information for (on a 1-based index). + * @throws SQLException If the column information cannot be retrieved. + * @throws NullPointerException if rsmd is null. + * @throws IllegalArgumentException if column is out of bounds. + */ + public JdbcFieldInfo(ResultSetMetaData rsmd, int column) throws SQLException { + Preconditions.checkNotNull(rsmd, "ResultSetMetaData cannot be null."); + Preconditions.checkArgument(column > 0, "ResultSetMetaData columns have indices starting at 1."); + Preconditions.checkArgument( + column <= rsmd.getColumnCount(), + "The index must be within the number of columns (1 to %s, inclusive)", rsmd.getColumnCount()); + + this.column = column; + this.jdbcType = rsmd.getColumnType(column); + this.nullability = rsmd.isNullable(column); + this.precision = rsmd.getPrecision(column); + this.scale = rsmd.getScale(column); + } + + /** + * The {@link Types} type. + */ + public int getJdbcType() { + return jdbcType; + } + + /** + * The nullability. + */ + public int isNullable() { + return nullability; + } + + /** + * The numeric precision, for {@link Types#NUMERIC} and {@link Types#DECIMAL} types. 
+ */ + public int getPrecision() { + return precision; + } + + /** + * The numeric scale, for {@link Types#NUMERIC} and {@link Types#DECIMAL} types. + */ + public int getScale() { + return scale; + } + + /** + * The column index for query column. + */ + public int getColumn() { + return column; + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcParameterBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcParameterBinder.java new file mode 100644 index 0000000..b1430f3 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcParameterBinder.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor; + +import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder.ColumnBinder; + +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.VectorSchemaRoot; + +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.util.HashMap; +import java.util.Map; + +/** + * A binder binds JDBC prepared statement parameters to rows of Arrow data from a VectorSchemaRoot. + *
+ * <p>
+ * Each row of the VectorSchemaRoot will be bound to the configured parameters of the PreparedStatement. + * One row of data is bound at a time. + */ +public class JdbcParameterBinder { + private final PreparedStatement statement; + private final VectorSchemaRoot root; + private final ColumnBinder[] binders; + private final int[] parameterIndices; + private int nextRowIndex; + + /** + * Create a new parameter binder. + * + * @param statement The statement to bind parameters to. + * @param root The VectorSchemaRoot to pull data from. + * @param binders Column binders to translate from Arrow data to JDBC parameters, one per parameter. + * @param parameterIndices For each binder in binders, the index of the parameter to bind to. + */ + private JdbcParameterBinder( + final PreparedStatement statement, + final VectorSchemaRoot root, + final ColumnBinder[] binders, + int[] parameterIndices) { + Preconditions.checkArgument( + binders.length == parameterIndices.length, + "Number of column binders (%s) must equal number of parameter indices (%s)", + binders.length, parameterIndices.length); + this.statement = statement; + this.root = root; + this.binders = binders; + this.parameterIndices = parameterIndices; + this.nextRowIndex = 0; + } + + /** + * Initialize a binder with a builder. + * + * @param statement The statement to bind to. The binder does not maintain ownership of the statement. + * @param root The {@link VectorSchemaRoot} to pull data from. The binder does not maintain ownership + * of the vector schema root. + */ + public static Builder builder(final PreparedStatement statement, final VectorSchemaRoot root) { + return new Builder(statement, root); + } + + /** + * Reset the binder (so the root can be updated with new data). + */ + public void reset() { + nextRowIndex = 0; + } + + /** + * Bind the next row of data to the parameters of the statement. + *
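+     * <p>A typical batch-insert loop looks like this (a sketch; {@code statement} and {@code root} are assumed to be in scope):
+     * <pre>{@code
+     * JdbcParameterBinder binder = JdbcParameterBinder.builder(statement, root).bindAll().build();
+     * while (binder.next()) {
+     *     statement.addBatch();
+     * }
+     * statement.executeBatch();
+     * }</pre>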
+     * <p>
+     * After this, the application should call the desired method on the prepared statement,
+     * such as {@link PreparedStatement#executeUpdate()}, or {@link PreparedStatement#addBatch()}.
+     *
+     * @return true if a row was bound, false if rows were exhausted
+     */
+    public boolean next() throws SQLException {
+        if (nextRowIndex >= root.getRowCount()) {
+            return false;
+        }
+        for (int i = 0; i < parameterIndices.length; i++) {
+            final int parameterIndex = parameterIndices[i];
+            binders[i].bind(statement, parameterIndex, nextRowIndex);
+        }
+        nextRowIndex++;
+        return true;
+    }
+
+    /**
+     * A builder for a {@link JdbcParameterBinder}.
+     */
+    public static class Builder {
+        private final PreparedStatement statement;
+        private final VectorSchemaRoot root;
+        private final Map<Integer, ColumnBinder> bindings;
+
+        Builder(PreparedStatement statement, VectorSchemaRoot root) {
+            this.statement = statement;
+            this.root = root;
+            this.bindings = new HashMap<>();
+        }
+
+        /**
+         * Bind each column to the corresponding parameter in order.
+         */
+        public Builder bindAll() {
+            for (int i = 0; i < root.getFieldVectors().size(); i++) {
+                bind(/*parameterIndex=*/ i + 1, /*columnIndex=*/ i);
+            }
+            return this;
+        }
+
+        /**
+         * Bind the given parameter to the given column using the default binder.
+         */
+        public Builder bind(int parameterIndex, int columnIndex) {
+            return bind(
+                    parameterIndex,
+                    ColumnBinder.forVector(root.getVector(columnIndex)));
+        }
+
+        /**
+         * Bind the given parameter using the given binder.
+         */
+        public Builder bind(int parameterIndex, ColumnBinder binder) {
+            Preconditions.checkArgument(
+                    parameterIndex > 0, "parameterIndex %d must be positive", parameterIndex);
+            bindings.put(parameterIndex, binder);
+            return this;
+        }
+
+        /**
+         * Build the binder.
+         */
+        public JdbcParameterBinder build() {
+            ColumnBinder[] binders = new ColumnBinder[bindings.size()];
+            int[] parameterIndices = new int[bindings.size()];
+            int index = 0;
+            for (Map.Entry<Integer, ColumnBinder> entry : bindings.entrySet()) {
+                binders[index] = entry.getValue();
+                parameterIndices[index] = entry.getKey();
+                index++;
+            }
+            return new JdbcParameterBinder(statement, root, binders, parameterIndices);
+        }
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcToArrow.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcToArrow.java new file mode 100644 index 0000000..1ff07fb --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcToArrow.java @@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Preconditions; + +import java.io.IOException; +import java.sql.ResultSet; +import java.sql.SQLException; + +/** + * Utility class to convert JDBC objects to columnar Arrow format objects. + * + *
+ * <p>This utility uses the following data mapping to map JDBC/SQL datatypes to Arrow data types.
+ *
+ * <p>CHAR --> ArrowType.Utf8
+ * NCHAR --> ArrowType.Utf8
+ * VARCHAR --> ArrowType.Utf8
+ * NVARCHAR --> ArrowType.Utf8
+ * LONGVARCHAR --> ArrowType.Utf8
+ * LONGNVARCHAR --> ArrowType.Utf8
+ * NUMERIC --> ArrowType.Decimal(precision, scale)
+ * DECIMAL --> ArrowType.Decimal(precision, scale)
+ * BIT --> ArrowType.Bool
+ * TINYINT --> ArrowType.Int(8, signed)
+ * SMALLINT --> ArrowType.Int(16, signed)
+ * INTEGER --> ArrowType.Int(32, signed)
+ * BIGINT --> ArrowType.Int(64, signed)
+ * REAL --> ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)
+ * FLOAT --> ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)
+ * DOUBLE --> ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)
+ * BINARY --> ArrowType.Binary
+ * VARBINARY --> ArrowType.Binary
+ * LONGVARBINARY --> ArrowType.Binary
+ * DATE --> ArrowType.Date(DateUnit.MILLISECOND)
+ * TIME --> ArrowType.Time(TimeUnit.MILLISECOND, 32)
+ * TIMESTAMP --> ArrowType.Timestamp(TimeUnit.MILLISECOND, timezone=null)
+ * CLOB --> ArrowType.Utf8
+ * BLOB --> ArrowType.Binary
+ *
+ * @since 0.10.0
+ */
+public class JdbcToArrow {
+
+    /*----------------------------------------------------------------*
+    |                                                                  |
+    |                      Partial Convert API                         |
+    |                                                                  |
+    *----------------------------------------------------------------*/
+
+    /**
+     * For the given JDBC {@link ResultSet}, fetch the data from Relational DB and convert it to Arrow objects.
+     * Note that this overload uses the default targetBatchSize of 1024.
+     *
+     * @param resultSet ResultSet to use to fetch the data from underlying database
+     * @param allocator Memory allocator
+     * @return Arrow Data Objects {@link ArrowVectorIterator}
+     * @throws SQLException on error
+     */
+    public static ArrowVectorIterator sqlToArrowVectorIterator(
+            ResultSet resultSet,
+            BufferAllocator allocator)
+            throws SQLException, IOException {
+        Preconditions.checkNotNull(allocator, "Memory Allocator object can not be null");
+
+        JdbcToArrowConfig config =
+                new JdbcToArrowConfig(allocator, JdbcToArrowUtils.getUtcCalendar());
+        return sqlToArrowVectorIterator(resultSet, config);
+    }
+
+    /**
+     * For the given JDBC {@link ResultSet}, fetch the data from Relational DB and convert it to Arrow objects.
+     * Note that if {@link JdbcToArrowConfig#targetBatchSize} is not specified, the default value of 1024 is used.
+     *
+     * @param resultSet ResultSet to use to fetch the data from underlying database
+     * @param config    Configuration of the conversion from JDBC to Arrow.
+     * @return Arrow Data Objects {@link ArrowVectorIterator}
+     * @throws SQLException on error
+     */
+    public static ArrowVectorIterator sqlToArrowVectorIterator(
+            ResultSet resultSet,
+            JdbcToArrowConfig config)
+            throws SQLException, IOException {
+        Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null");
+        Preconditions.checkNotNull(config, "The configuration cannot be null");
+        return ArrowVectorIterator.create(resultSet, config);
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcToArrowConfig.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcToArrowConfig.java new file mode 100644 index 0000000..baa97f0 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcToArrowConfig.java @@ -0,0 +1,336 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor; + +import lombok.Getter; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.types.pojo.ArrowType; + +import java.math.RoundingMode; +import java.util.Calendar; +import java.util.Map; +import java.util.function.Function; + +/** + * This class configures the JDBC-to-Arrow conversion process. + *
+ * <p>
+ * The allocator is used to construct the {@link org.apache.arrow.vector.VectorSchemaRoot}, + * and the calendar is used to define the time zone of any + * {@link ArrowType.Timestamp} + * fields that are created during the conversion. Neither field may be null. + *
+ * <p>
+ * If the includeMetadata flag is set, the Arrow field metadata will contain information + * from the corresponding {@link java.sql.ResultSetMetaData} that was used to create the + * {@link org.apache.arrow.vector.types.pojo.FieldType} of the corresponding + * {@link org.apache.arrow.vector.FieldVector}. + *
+ * <p>
+ * If there are any {@link java.sql.Types#ARRAY} fields in the {@link java.sql.ResultSet}, the corresponding + * {@link JdbcFieldInfo} for the array's contents must be defined here. Unfortunately, the sub-type + * information cannot be retrieved from all JDBC implementations (H2 for example, returns + * {@link java.sql.Types#NULL} for the array sub-type), so it must be configured here. The column index + * or name can be used to map to a {@link JdbcFieldInfo}, and that will be used for the conversion. + *
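+ *
+ * <p>A typical configuration might be built as follows (a sketch; the batch size is illustrative):
+ * <pre>{@code
+ * JdbcToArrowConfig config =
+ *     new JdbcToArrowConfigBuilder(new RootAllocator(), JdbcToArrowUtils.getUtcCalendar())
+ *         .setTargetBatchSize(4096)
+ *         .setReuseVectorSchemaRoot(true)
+ *         .build();
+ * }</pre>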
+ */
+public final class JdbcToArrowConfig {
+
+    public static final int DEFAULT_TARGET_BATCH_SIZE = 1024;
+    public static final int NO_LIMIT_BATCH_SIZE = -1;
+    private final Calendar calendar;
+    private final BufferAllocator allocator;
+    private final boolean includeMetadata;
+    private final boolean reuseVectorSchemaRoot;
+    private final Map<Integer, JdbcFieldInfo> arraySubTypesByColumnIndex;
+    private final Map<String, JdbcFieldInfo> arraySubTypesByColumnName;
+    private final Map<Integer, JdbcFieldInfo> explicitTypesByColumnIndex;
+    private final Map<String, JdbcFieldInfo> explicitTypesByColumnName;
+    /**
+     * -- GETTER --
+     * Return schema level metadata or null if not provided.
+     */
+    @Getter
+    private final Map<String, String> schemaMetadata;
+    /**
+     * -- GETTER --
+     * Return metadata from columnIndex->meta map on per field basis
+     * or null if not provided.
+     */
+    @Getter
+    private final Map<Integer, Map<String, String>> columnMetadataByColumnIndex;
+    @Getter
+    private final RoundingMode bigDecimalRoundingMode;
+    /**
+     * The maximum rowCount to read each time when partially converting data.
+     * The default value is 1024; NO_LIMIT_BATCH_SIZE (-1) disables partial reads.
+     * Note that this flag is only useful for {@link JdbcToArrow#sqlToArrowVectorIterator}:
+     * 1) if targetBatchSize != -1, the full data is converted into multiple vectors
+     * with valueCount no more than targetBatchSize;
+     * 2) if targetBatchSize == -1, the full data is converted into a single vector in {@link ArrowVectorIterator}.
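+     * <p>For example (illustrative): with targetBatchSize = 1024, a ResultSet with 2,500 rows is
+     * delivered as three batches of 1024, 1024, and 452 rows.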
+ */ + private final int targetBatchSize; + + private final Function jdbcToArrowTypeConverter; + + /** + * Constructs a new configuration from the provided allocator and calendar. The allocator + * is used when constructing the Arrow vectors from the ResultSet, and the calendar is used to define + * Arrow Timestamp fields, and to read time-based fields from the JDBC ResultSet. + * + * @param allocator The memory allocator to construct the Arrow vectors with. + * @param calendar The calendar to use when constructing Timestamp fields and reading time-based results. + */ + JdbcToArrowConfig(BufferAllocator allocator, Calendar calendar) { + this(allocator, calendar, + /* include metadata */ false, + /* reuse vector schema root */ false, + /* array sub-types by column index */ null, + /* array sub-types by column name */ null, + DEFAULT_TARGET_BATCH_SIZE, null, null); + } + + JdbcToArrowConfig( + BufferAllocator allocator, + Calendar calendar, + boolean includeMetadata, + boolean reuseVectorSchemaRoot, + Map arraySubTypesByColumnIndex, + Map arraySubTypesByColumnName, + int targetBatchSize, + Function jdbcToArrowTypeConverter) { + this(allocator, calendar, includeMetadata, reuseVectorSchemaRoot, arraySubTypesByColumnIndex, + arraySubTypesByColumnName, targetBatchSize, jdbcToArrowTypeConverter, null); + } + + /** + * Constructs a new configuration from the provided allocator and calendar. The allocator + * is used when constructing the Arrow vectors from the ResultSet, and the calendar is used to define + * Arrow Timestamp fields, and to read time-based fields from the JDBC ResultSet. + * + * @param allocator The memory allocator to construct the Arrow vectors with. + * @param calendar The calendar to use when constructing Timestamp fields and reading time-based results. + * @param includeMetadata Whether to include JDBC field metadata in the Arrow Schema Field metadata. + * @param reuseVectorSchemaRoot Whether to reuse the vector schema root for each data load. + * @param arraySubTypesByColumnIndex The type of the JDBC array at the column index (1-based). + * @param arraySubTypesByColumnName The type of the JDBC array at the column name. + * @param targetBatchSize The target batch size to be used in preallcation of the resulting vectors. + * @param jdbcToArrowTypeConverter The function that maps JDBC field type information to arrow type. If set to null, + * the default mapping will be used, which is defined as: + *
+ * <ul>
+ *   <li>CHAR --> ArrowType.Utf8</li>
+ *   <li>NCHAR --> ArrowType.Utf8</li>
+ *   <li>VARCHAR --> ArrowType.Utf8</li>
+ *   <li>NVARCHAR --> ArrowType.Utf8</li>
+ *   <li>LONGVARCHAR --> ArrowType.Utf8</li>
+ *   <li>LONGNVARCHAR --> ArrowType.Utf8</li>
+ *   <li>NUMERIC --> ArrowType.Decimal(precision, scale)</li>
+ *   <li>DECIMAL --> ArrowType.Decimal(precision, scale)</li>
+ *   <li>BIT --> ArrowType.Bool</li>
+ *   <li>TINYINT --> ArrowType.Int(8, signed)</li>
+ *   <li>SMALLINT --> ArrowType.Int(16, signed)</li>
+ *   <li>INTEGER --> ArrowType.Int(32, signed)</li>
+ *   <li>BIGINT --> ArrowType.Int(64, signed)</li>
+ *   <li>REAL --> ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)</li>
+ *   <li>FLOAT --> ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)</li>
+ *   <li>DOUBLE --> ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)</li>
+ *   <li>BINARY --> ArrowType.Binary</li>
+ *   <li>VARBINARY --> ArrowType.Binary</li>
+ *   <li>LONGVARBINARY --> ArrowType.Binary</li>
+ *   <li>DATE --> ArrowType.Date(DateUnit.DAY)</li>
+ *   <li>TIME --> ArrowType.Time(TimeUnit.MILLISECOND, 32)</li>
+ *   <li>TIMESTAMP --> ArrowType.Timestamp(TimeUnit.MILLISECOND, calendar timezone)</li>
+ *   <li>CLOB --> ArrowType.Utf8</li>
+ *   <li>BLOB --> ArrowType.Binary</li>
+ *   <li>ARRAY --> ArrowType.List</li>
+ *   <li>STRUCT --> ArrowType.Struct</li>
+ *   <li>NULL --> ArrowType.Null</li>
+ * </ul>
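+ * A custom converter can wrap the default (a sketch; {@code calendar} is assumed to be in scope):
+ * <pre>{@code
+ * Function<JdbcFieldInfo, ArrowType> converter = info ->
+ *     info.getJdbcType() == java.sql.Types.CHAR
+ *         ? new ArrowType.Utf8()
+ *         : JdbcToArrowUtils.getArrowTypeFromJdbcType(info, calendar);
+ * }</pre>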
+ * @param bigDecimalRoundingMode The java.math.RoundingMode to be used in coercion of a BigDecimal from a
+ *                               ResultSet having a scale which does not match that of the target vector. Use null
+ *                               (default value) to require strict scale matching.
+ */
+    JdbcToArrowConfig(
+            BufferAllocator allocator,
+            Calendar calendar,
+            boolean includeMetadata,
+            boolean reuseVectorSchemaRoot,
+            Map<Integer, JdbcFieldInfo> arraySubTypesByColumnIndex,
+            Map<String, JdbcFieldInfo> arraySubTypesByColumnName,
+            int targetBatchSize,
+            Function<JdbcFieldInfo, ArrowType> jdbcToArrowTypeConverter,
+            RoundingMode bigDecimalRoundingMode) {
+
+        this(
+                allocator,
+                calendar,
+                includeMetadata,
+                reuseVectorSchemaRoot,
+                arraySubTypesByColumnIndex,
+                arraySubTypesByColumnName,
+                targetBatchSize,
+                jdbcToArrowTypeConverter,
+                null,
+                null,
+                null,
+                null,
+                bigDecimalRoundingMode);
+    }
+
+    JdbcToArrowConfig(
+            BufferAllocator allocator,
+            Calendar calendar,
+            boolean includeMetadata,
+            boolean reuseVectorSchemaRoot,
+            Map<Integer, JdbcFieldInfo> arraySubTypesByColumnIndex,
+            Map<String, JdbcFieldInfo> arraySubTypesByColumnName,
+            int targetBatchSize,
+            Function<JdbcFieldInfo, ArrowType> jdbcToArrowTypeConverter,
+            Map<Integer, JdbcFieldInfo> explicitTypesByColumnIndex,
+            Map<String, JdbcFieldInfo> explicitTypesByColumnName,
+            Map<String, String> schemaMetadata,
+            Map<Integer, Map<String, String>> columnMetadataByColumnIndex,
+            RoundingMode bigDecimalRoundingMode) {
+        Preconditions.checkNotNull(allocator, "Memory allocator cannot be null");
+        this.allocator = allocator;
+        this.calendar = calendar;
+        this.includeMetadata = includeMetadata;
+        this.reuseVectorSchemaRoot = reuseVectorSchemaRoot;
+        this.arraySubTypesByColumnIndex = arraySubTypesByColumnIndex;
+        this.arraySubTypesByColumnName = arraySubTypesByColumnName;
+        this.targetBatchSize = targetBatchSize;
+        this.explicitTypesByColumnIndex = explicitTypesByColumnIndex;
+        this.explicitTypesByColumnName = explicitTypesByColumnName;
+        this.schemaMetadata = schemaMetadata;
+        this.columnMetadataByColumnIndex = columnMetadataByColumnIndex;
+        this.bigDecimalRoundingMode = bigDecimalRoundingMode;
+
+        // set up type converter
+        this.jdbcToArrowTypeConverter = jdbcToArrowTypeConverter != null ? jdbcToArrowTypeConverter :
+                (jdbcFieldInfo) -> JdbcToArrowUtils.getArrowTypeFromJdbcType(jdbcFieldInfo, calendar);
+    }
+
+    /**
+     * The calendar to use when defining Arrow Timestamp fields
+     * and retrieving {@link java.sql.Date}, {@link java.sql.Time}, or {@link java.sql.Timestamp}
+     * data types from the {@link java.sql.ResultSet}, or null if not converting.
+     *
+     * @return the calendar.
+     */
+    public Calendar getCalendar() {
+        return calendar;
+    }
+
+    /**
+     * The Arrow memory allocator.
+     *
+     * @return the allocator.
+     */
+    public BufferAllocator getAllocator() {
+        return allocator;
+    }
+
+    /**
+     * Whether to include JDBC ResultSet field metadata in the Arrow Schema field metadata.
+     *
+     * @return true to include field metadata, false to exclude it.
+     */
+    public boolean shouldIncludeMetadata() {
+        return includeMetadata;
+    }
+
+    /**
+     * Get the target batch size for partial read.
+     */
+    public int getTargetBatchSize() {
+        return targetBatchSize;
+    }
+
+    /**
+     * Get whether it is allowed to reuse the vector schema root.
+     */
+    public boolean isReuseVectorSchemaRoot() {
+        return reuseVectorSchemaRoot;
+    }
+
+    /**
+     * Gets the mapping from JDBC type information to Arrow type.
+     */
+    public Function<JdbcFieldInfo, ArrowType> getJdbcToArrowTypeConverter() {
+        return jdbcToArrowTypeConverter;
+    }
+
+    /**
+     * Returns the array sub-type {@link JdbcFieldInfo} defined for the provided column index.
+     *
+     * @param index The {@link java.sql.ResultSetMetaData} column index of an {@link java.sql.Types#ARRAY} type.
+ * @return The {@link JdbcFieldInfo} for that array's sub-type, or null if not defined. + */ + public JdbcFieldInfo getArraySubTypeByColumnIndex(int index) { + if (arraySubTypesByColumnIndex == null) { + return null; + } else { + return arraySubTypesByColumnIndex.get(index); + } + } + + /** + * Returns the array sub-type {@link JdbcFieldInfo} defined for the provided column name. + * + * @param name The {@link java.sql.ResultSetMetaData} column name of an {@link java.sql.Types#ARRAY} type. + * @return The {@link JdbcFieldInfo} for that array's sub-type, or null if not defined. + */ + public JdbcFieldInfo getArraySubTypeByColumnName(String name) { + if (arraySubTypesByColumnName == null) { + return null; + } else { + return arraySubTypesByColumnName.get(name); + } + } + + /** + * Returns the type {@link JdbcFieldInfo} explicitly defined for the provided column index. + * + * @param index The {@link java.sql.ResultSetMetaData} column index to evaluate for explicit type mapping. + * @return The {@link JdbcFieldInfo} defined for the column, or null if not defined. + */ + public JdbcFieldInfo getExplicitTypeByColumnIndex(int index) { + if (explicitTypesByColumnIndex == null) { + return null; + } else { + return explicitTypesByColumnIndex.get(index); + } + } + + /** + * Returns the type {@link JdbcFieldInfo} explicitly defined for the provided column name. + * + * @param name The {@link java.sql.ResultSetMetaData} column name to evaluate for explicit type mapping. + * @return The {@link JdbcFieldInfo} defined for the column, or null if not defined. + */ + public JdbcFieldInfo getExplicitTypeByColumnName(String name) { + if (explicitTypesByColumnName == null) { + return null; + } else { + return explicitTypesByColumnName.get(name); + } + } + +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcToArrowConfigBuilder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcToArrowConfigBuilder.java new file mode 100644 index 0000000..2874d33 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcToArrowConfigBuilder.java @@ -0,0 +1,284 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.types.pojo.ArrowType; + +import java.math.RoundingMode; +import java.util.Calendar; +import java.util.Map; +import java.util.function.Function; + + +/** + * This class builds {@link JdbcToArrowConfig}s. 
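+ *
+ * <p>For example, configuring the sub-type of an ARRAY column (a sketch; the column index is
+ * illustrative and {@code allocator} is assumed to be in scope):
+ * <pre>{@code
+ * Map<Integer, JdbcFieldInfo> arraySubTypes = new HashMap<>();
+ * arraySubTypes.put(3, new JdbcFieldInfo(java.sql.Types.INTEGER)); // column 3 is an ARRAY of INTEGER
+ * JdbcToArrowConfig config = new JdbcToArrowConfigBuilder(allocator, JdbcToArrowUtils.getUtcCalendar())
+ *     .setArraySubTypeByColumnIndexMap(arraySubTypes)
+ *     .build();
+ * }</pre>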
+ */
+public class JdbcToArrowConfigBuilder {
+    private Calendar calendar;
+    private BufferAllocator allocator;
+    private boolean includeMetadata;
+    private boolean reuseVectorSchemaRoot;
+    private Map<Integer, JdbcFieldInfo> arraySubTypesByColumnIndex;
+    private Map<String, JdbcFieldInfo> arraySubTypesByColumnName;
+    private Map<Integer, JdbcFieldInfo> explicitTypesByColumnIndex;
+    private Map<String, JdbcFieldInfo> explicitTypesByColumnName;
+    private Map<String, String> schemaMetadata;
+    private Map<Integer, Map<String, String>> columnMetadataByColumnIndex;
+    private int targetBatchSize;
+    private Function<JdbcFieldInfo, ArrowType> jdbcToArrowTypeConverter;
+    private RoundingMode bigDecimalRoundingMode;
+
+    /**
+     * Default constructor for the JdbcToArrowConfigBuilder.
+     * Use the setter methods for the allocator and calendar; the allocator must be
+     * set. Otherwise, {@link #build()} will throw a {@link NullPointerException}.
+     */
+    public JdbcToArrowConfigBuilder() {
+        this.allocator = null;
+        this.calendar = null;
+        this.includeMetadata = false;
+        this.reuseVectorSchemaRoot = false;
+        this.arraySubTypesByColumnIndex = null;
+        this.arraySubTypesByColumnName = null;
+        this.explicitTypesByColumnIndex = null;
+        this.explicitTypesByColumnName = null;
+        this.schemaMetadata = null;
+        this.columnMetadataByColumnIndex = null;
+        this.bigDecimalRoundingMode = null;
+    }
+
+    /**
+     * Constructor for the JdbcToArrowConfigBuilder. The
+     * allocator is required, and a {@link NullPointerException}
+     * will be thrown if it is null.
+     *
+     * <p>
+ * The allocator is used to construct Arrow vectors from the JDBC ResultSet. + * The calendar is used to determine the time zone of {@link java.sql.Timestamp} + * fields and convert {@link java.sql.Date}, {@link java.sql.Time}, and + * {@link java.sql.Timestamp} fields to a single, common time zone when reading + * from the result set. + *
+ * + * @param allocator The Arrow Vector memory allocator. + * @param calendar The calendar to use when constructing timestamp fields. + */ + public JdbcToArrowConfigBuilder(BufferAllocator allocator, Calendar calendar) { + this(); + + Preconditions.checkNotNull(allocator, "Memory allocator cannot be null"); + + this.allocator = allocator; + this.calendar = calendar; + this.includeMetadata = false; + this.reuseVectorSchemaRoot = false; + this.targetBatchSize = JdbcToArrowConfig.DEFAULT_TARGET_BATCH_SIZE; + } + + /** + * Constructor for the JdbcToArrowConfigBuilder. Both the + * allocator and calendar are required. A {@link NullPointerException} + * will be thrown if either of those arguments is null. + *
+     * <p>
+ * The allocator is used to construct Arrow vectors from the JDBC ResultSet. + * The calendar is used to determine the time zone of {@link java.sql.Timestamp} + * fields and convert {@link java.sql.Date}, {@link java.sql.Time}, and + * {@link java.sql.Timestamp} fields to a single, common time zone when reading + * from the result set. + *
+     * <p>
+     * The includeMetadata argument, if true, will cause
+     * various information about each database field to be added to the Vector
+     * Schema's field metadata.
+     *
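+     * <p>For example (a sketch; {@code allocator} and {@code calendar} are assumed to be in scope):
+     * <pre>{@code
+     * JdbcToArrowConfigBuilder builder =
+     *     new JdbcToArrowConfigBuilder(allocator, calendar, true); // includeMetadata = true
+     * }</pre>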
+ * + * @param allocator The Arrow Vector memory allocator. + * @param calendar The calendar to use when constructing timestamp fields. + */ + public JdbcToArrowConfigBuilder(BufferAllocator allocator, Calendar calendar, boolean includeMetadata) { + this(allocator, calendar); + this.includeMetadata = includeMetadata; + } + + /** + * Sets the memory allocator to use when constructing the Arrow vectors from the ResultSet. + * + * @param allocator the allocator to set. + * @throws NullPointerException if allocator is null. + */ + public JdbcToArrowConfigBuilder setAllocator(BufferAllocator allocator) { + Preconditions.checkNotNull(allocator, "Memory allocator cannot be null"); + this.allocator = allocator; + return this; + } + + /** + * Sets the {@link Calendar} to use when constructing timestamp fields in the + * Arrow schema, and reading time-based fields from the JDBC ResultSet. + * + * @param calendar the calendar to set. + */ + public JdbcToArrowConfigBuilder setCalendar(Calendar calendar) { + this.calendar = calendar; + return this; + } + + /** + * Sets whether to include JDBC ResultSet field metadata in the Arrow Schema field metadata. + * + * @param includeMetadata Whether to include or exclude JDBC metadata in the Arrow Schema field metadata. + * @return This instance of the JdbcToArrowConfig, for chaining. + */ + public JdbcToArrowConfigBuilder setIncludeMetadata(boolean includeMetadata) { + this.includeMetadata = includeMetadata; + return this; + } + + /** + * Sets the mapping of column-index-to-{@link JdbcFieldInfo} used for columns of type {@link java.sql.Types#ARRAY}. + * The column index is 1-based, to match the JDBC column index. + * + * @param map The mapping. + * @return This instance of the JdbcToArrowConfig, for chaining. + */ + public JdbcToArrowConfigBuilder setArraySubTypeByColumnIndexMap(Map map) { + this.arraySubTypesByColumnIndex = map; + return this; + } + + /** + * Sets the mapping of column-name-to-{@link JdbcFieldInfo} used for columns of type {@link java.sql.Types#ARRAY}. + * + * @param map The mapping. + * @return This instance of the JdbcToArrowConfig, for chaining. + */ + public JdbcToArrowConfigBuilder setArraySubTypeByColumnNameMap(Map map) { + this.arraySubTypesByColumnName = map; + return this; + } + + /** + * Sets the mapping of column-index-to-{@link JdbcFieldInfo} used for column types. + *
+     * <p>
+ * This can be useful to override type information from JDBC drivers that provide incomplete type info, + * e.g. DECIMAL with precision = scale = 0. + *
+     * <p>
+     * The column index is 1-based, to match the JDBC column index.
+     *
+     * @param map The mapping.
+     */
+    public JdbcToArrowConfigBuilder setExplicitTypesByColumnIndex(Map<Integer, JdbcFieldInfo> map) {
+        this.explicitTypesByColumnIndex = map;
+        return this;
+    }
+
+    /**
+     * Sets the mapping of column-name-to-{@link JdbcFieldInfo} used for column types.
+     *
+     * <p>
+     * This can be useful to override type information from JDBC drivers that provide incomplete type info,
+     * e.g. DECIMAL with precision = scale = 0.
+     *
+     * @param map The mapping.
+     */
+    public JdbcToArrowConfigBuilder setExplicitTypesByColumnName(Map<String, JdbcFieldInfo> map) {
+        this.explicitTypesByColumnName = map;
+        return this;
+    }
+
+    /**
+     * Set the target number of rows to convert at once.
+     *
+     * <p>
+ * Use {@link JdbcToArrowConfig#NO_LIMIT_BATCH_SIZE} to read all rows at once. + */ + public JdbcToArrowConfigBuilder setTargetBatchSize(int targetBatchSize) { + this.targetBatchSize = targetBatchSize; + return this; + } + + /** + * Set the function used to convert JDBC types to Arrow types. + *
+     * <p>
+     * Defaults to wrapping {@link JdbcToArrowUtils#getArrowTypeFromJdbcType(JdbcFieldInfo, Calendar)}.
+     */
+    public JdbcToArrowConfigBuilder setJdbcToArrowTypeConverter(
+            Function<JdbcFieldInfo, ArrowType> jdbcToArrowTypeConverter) {
+        this.jdbcToArrowTypeConverter = jdbcToArrowTypeConverter;
+        return this;
+    }
+
+    /**
+     * Set whether to use the same {@link org.apache.arrow.vector.VectorSchemaRoot} instance on each iteration,
+     * or to allocate a new one.
+     */
+    public JdbcToArrowConfigBuilder setReuseVectorSchemaRoot(boolean reuseVectorSchemaRoot) {
+        this.reuseVectorSchemaRoot = reuseVectorSchemaRoot;
+        return this;
+    }
+
+    /**
+     * Set metadata for schema.
+     */
+    public JdbcToArrowConfigBuilder setSchemaMetadata(Map<String, String> schemaMetadata) {
+        this.schemaMetadata = schemaMetadata;
+        return this;
+    }
+
+    /**
+     * Set metadata from columnIndex->meta map on per field basis.
+     */
+    public JdbcToArrowConfigBuilder setColumnMetadataByColumnIndex(
+            Map<Integer, Map<String, String>> columnMetadataByColumnIndex) {
+        this.columnMetadataByColumnIndex = columnMetadataByColumnIndex;
+        return this;
+    }
+
+    /**
+     * Set the rounding mode used when the scale of the actual value does not match the declared scale.
+     *
+     * <p>
+ * By default, an error is raised in such cases. + */ + public JdbcToArrowConfigBuilder setBigDecimalRoundingMode(RoundingMode bigDecimalRoundingMode) { + this.bigDecimalRoundingMode = bigDecimalRoundingMode; + return this; + } + + /** + * This builds the {@link JdbcToArrowConfig} from the provided + * {@link BufferAllocator} and {@link Calendar}. + * + * @return The built {@link JdbcToArrowConfig} + * @throws NullPointerException if either the allocator or calendar was not set. + */ + public JdbcToArrowConfig build() { + return new JdbcToArrowConfig( + allocator, + calendar, + includeMetadata, + reuseVectorSchemaRoot, + arraySubTypesByColumnIndex, + arraySubTypesByColumnName, + targetBatchSize, + jdbcToArrowTypeConverter, + explicitTypesByColumnIndex, + explicitTypesByColumnName, + schemaMetadata, + columnMetadataByColumnIndex, + bigDecimalRoundingMode); + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcToArrowUtils.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcToArrowUtils.java new file mode 100644 index 0000000..49d6609 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/JdbcToArrowUtils.java @@ -0,0 +1,448 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor; + +import org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer.*; + +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.*; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.types.DateUnit; +import static org.apache.arrow.vector.types.FloatingPointPrecision.DOUBLE; +import static org.apache.arrow.vector.types.FloatingPointPrecision.SINGLE; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.ValueVectorUtility; + +import java.io.IOException; +import java.math.RoundingMode; +import java.sql.Date; +import java.sql.*; +import java.util.*; + +/** + * Class that does most of the work to convert JDBC ResultSet data into Arrow columnar format Vector objects. + * + * @since 0.10.0 + */ +public class JdbcToArrowUtils { + + private static final int JDBC_ARRAY_VALUE_COLUMN = 2; + + /** + * Returns the instance of a {java.util.Calendar} with the UTC time zone and root locale. 
+ */ + public static Calendar getUtcCalendar() { + return Calendar.getInstance(TimeZone.getTimeZone("UTC"), Locale.ROOT); + } + + /** + * Create Arrow {@link Schema} object for the given JDBC {@link ResultSetMetaData}. + * + * @param rsmd The ResultSetMetaData containing the results, to read the JDBC metadata from. + * @param calendar The calendar to use the time zone field of, to construct Timestamp fields from. + * @return {@link Schema} + * @throws SQLException on error + */ + public static Schema jdbcToArrowSchema(ResultSetMetaData rsmd, Calendar calendar) throws SQLException { + Preconditions.checkNotNull(calendar, "Calendar object can't be null"); + + return jdbcToArrowSchema(rsmd, new JdbcToArrowConfig(new RootAllocator(0), calendar)); + } + + /** + * Create Arrow {@link Schema} object for the given JDBC {@link ResultSetMetaData}. + * + * @param parameterMetaData The ResultSetMetaData containing the results, to read the JDBC metadata from. + * @param calendar The calendar to use the time zone field of, to construct Timestamp fields from. + * @return {@link Schema} + * @throws SQLException on error + */ + public static Schema jdbcToArrowSchema(final ParameterMetaData parameterMetaData, final Calendar calendar) + throws SQLException { + Preconditions.checkNotNull(calendar, "Calendar object can't be null"); + Preconditions.checkNotNull(parameterMetaData); + final List parameterFields = new ArrayList<>(parameterMetaData.getParameterCount()); + for (int parameterCounter = 1; parameterCounter <= parameterMetaData.getParameterCount(); + parameterCounter++) { + final int jdbcDataType = parameterMetaData.getParameterType(parameterCounter); + final int jdbcIsNullable = parameterMetaData.isNullable(parameterCounter); + final boolean arrowIsNullable = jdbcIsNullable != ParameterMetaData.parameterNoNulls; + final int precision = parameterMetaData.getPrecision(parameterCounter); + final int scale = parameterMetaData.getScale(parameterCounter); + final ArrowType arrowType = getArrowTypeFromJdbcType(new JdbcFieldInfo(jdbcDataType, precision, scale), calendar); + final FieldType fieldType = new FieldType(arrowIsNullable, arrowType, /*dictionary=*/null); + parameterFields.add(new Field(null, fieldType, null)); + } + + return new Schema(parameterFields); + } + + /** + * Converts the provided JDBC type to its respective {@link ArrowType} counterpart. + * + * @param fieldInfo the {@link JdbcFieldInfo} with information about the original JDBC type. + * @param calendar the {@link Calendar} to use for datetime data types. + * @return a new {@link ArrowType}. 
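+     * For example (illustrative), DECIMAL(10, 2) maps to a 128-bit Arrow decimal:
+     * <pre>{@code
+     * ArrowType type = JdbcToArrowUtils.getArrowTypeFromJdbcType(
+     *     new JdbcFieldInfo(java.sql.Types.DECIMAL, 10, 2), null); // ArrowType.Decimal(10, 2, 128)
+     * }</pre>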
+ */ + public static ArrowType getArrowTypeFromJdbcType(final JdbcFieldInfo fieldInfo, final Calendar calendar) { + switch (fieldInfo.getJdbcType()) { + case Types.BOOLEAN: + case Types.BIT: + return new ArrowType.Bool(); + case Types.TINYINT: + return new ArrowType.Int(8, true); + case Types.SMALLINT: + return new ArrowType.Int(16, true); + case Types.INTEGER: + return new ArrowType.Int(32, true); + case Types.BIGINT: + return new ArrowType.Int(64, true); + case Types.NUMERIC: + case Types.DECIMAL: + int precision = fieldInfo.getPrecision(); + int scale = fieldInfo.getScale(); + return new ArrowType.Decimal(precision, scale, 128); + case Types.REAL: + case Types.FLOAT: + return new ArrowType.FloatingPoint(SINGLE); + case Types.DOUBLE: + return new ArrowType.FloatingPoint(DOUBLE); + case Types.CHAR: + case Types.NCHAR: + case Types.VARCHAR: + case Types.NVARCHAR: + case Types.LONGVARCHAR: + case Types.LONGNVARCHAR: + case Types.CLOB: + return new ArrowType.Utf8(); + case Types.DATE: + return new ArrowType.Date(DateUnit.DAY); + case Types.TIME: + return new ArrowType.Time(TimeUnit.MILLISECOND, 32); + case Types.TIMESTAMP: + final String timezone; + if (calendar != null) { + timezone = calendar.getTimeZone().getID(); + } else { + timezone = null; + } + return new ArrowType.Timestamp(TimeUnit.MILLISECOND, timezone); + case Types.BINARY: + case Types.VARBINARY: + case Types.LONGVARBINARY: + case Types.BLOB: + return new ArrowType.Binary(); + case Types.ARRAY: + return new ArrowType.List(); + case Types.NULL: + return new ArrowType.Null(); + case Types.STRUCT: + return new ArrowType.Struct(); + default: + // no-op, shouldn't get here + return null; + } + } + + /** + * Create Arrow {@link Schema} object for the given JDBC {@link ResultSetMetaData}. + * + *
+ * <p>If {@link JdbcToArrowConfig#shouldIncludeMetadata()} returns true, the following fields
+ * will be added to the {@link FieldType#getMetadata()}:
+ * <ul>
+ *   <li>{@link Constants#SQL_CATALOG_NAME_KEY} representing {@link ResultSetMetaData#getCatalogName(int)}</li>
+ *   <li>{@link Constants#SQL_TABLE_NAME_KEY} representing {@link ResultSetMetaData#getTableName(int)}</li>
+ *   <li>{@link Constants#SQL_COLUMN_NAME_KEY} representing {@link ResultSetMetaData#getColumnLabel(int)}</li>
+ *   <li>{@link Constants#SQL_TYPE_KEY} representing {@link ResultSetMetaData#getColumnTypeName(int)}</li>
+ * </ul>
+ *
+ * <p>If any columns are of type {@link Types#ARRAY}, the configuration object will be used to look up
+ * the array sub-type field. The {@link JdbcToArrowConfig#getArraySubTypeByColumnIndex(int)} method will be
+ * checked first, followed by the {@link JdbcToArrowConfig#getArraySubTypeByColumnName(String)} method.
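+ *
+ * <p>A sketch of typical use ({@code resultSet} and {@code config} are assumed to be in scope):
+ * <pre>{@code
+ * Schema schema = JdbcToArrowUtils.jdbcToArrowSchema(resultSet.getMetaData(), config);
+ * }</pre>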
+ * + * @param rsmd The ResultSetMetaData containing the results, to read the JDBC metadata from. + * @param config The configuration to use when constructing the schema. + * @return {@link Schema} + * @throws SQLException on error + * @throws IllegalArgumentException if rsmd contains an {@link Types#ARRAY} but the + * config does not have a sub-type definition for it. + */ + public static Schema jdbcToArrowSchema(ResultSetMetaData rsmd, JdbcToArrowConfig config) throws SQLException { + Preconditions.checkNotNull(rsmd, "JDBC ResultSetMetaData object can't be null"); + Preconditions.checkNotNull(config, "The configuration object must not be null"); + + List fields = new ArrayList<>(); + int columnCount = rsmd.getColumnCount(); + for (int i = 1; i <= columnCount; i++) { + final String columnName = rsmd.getColumnLabel(i); + + final Map columnMetadata = config.getColumnMetadataByColumnIndex() != null ? + config.getColumnMetadataByColumnIndex().get(i) : null; + final Map metadata; + if (config.shouldIncludeMetadata()) { + metadata = new HashMap<>(); + metadata.put(Constants.SQL_CATALOG_NAME_KEY, rsmd.getCatalogName(i)); + metadata.put(Constants.SQL_SCHEMA_NAME_KEY, rsmd.getSchemaName(i)); + metadata.put(Constants.SQL_TABLE_NAME_KEY, rsmd.getTableName(i)); + metadata.put(Constants.SQL_COLUMN_NAME_KEY, columnName); + metadata.put(Constants.SQL_TYPE_KEY, rsmd.getColumnTypeName(i)); + if (columnMetadata != null && !columnMetadata.isEmpty()) { + metadata.putAll(columnMetadata); + } + } else { + if (columnMetadata != null && !columnMetadata.isEmpty()) { + metadata = columnMetadata; + } else { + metadata = null; + } + } + + final JdbcFieldInfo columnFieldInfo = getJdbcFieldInfoForColumn(rsmd, i, config); + final ArrowType arrowType = config.getJdbcToArrowTypeConverter().apply(columnFieldInfo); + if (arrowType != null) { + final FieldType fieldType = new FieldType( + isColumnNullable(rsmd, i, columnFieldInfo), arrowType, /* dictionary encoding */ null, metadata); + + List children = null; + if (arrowType.getTypeID() == ArrowType.List.TYPE_TYPE) { + final JdbcFieldInfo arrayFieldInfo = getJdbcFieldInfoForArraySubType(rsmd, i, config); + if (arrayFieldInfo == null) { + throw new IllegalArgumentException("Configuration does not provide a mapping for array column " + i); + } + children = new ArrayList(); + final ArrowType childType = config.getJdbcToArrowTypeConverter().apply(arrayFieldInfo); + children.add(new Field("child", FieldType.nullable(childType), null)); + } else if (arrowType.getTypeID() == ArrowType.ArrowTypeID.Map) { + FieldType mapType = new FieldType(false, ArrowType.Struct.INSTANCE, null, null); + FieldType keyType = new FieldType(false, new ArrowType.Utf8(), null, null); + FieldType valueType = new FieldType(false, new ArrowType.Utf8(), null, null); + children = new ArrayList<>(); + children.add(new Field("child", mapType, + Arrays.asList(new Field(MapVector.KEY_NAME, keyType, null), + new Field(MapVector.VALUE_NAME, valueType, null)))); + } + + fields.add(new Field(columnName, fieldType, children)); + } + } + return new Schema(fields, config.getSchemaMetadata()); + } + + static JdbcFieldInfo getJdbcFieldInfoForColumn( + ResultSetMetaData rsmd, + int arrayColumn, + JdbcToArrowConfig config) + throws SQLException { + Preconditions.checkNotNull(rsmd, "ResultSet MetaData object cannot be null"); + Preconditions.checkNotNull(config, "Configuration must not be null"); + Preconditions.checkArgument( + arrayColumn > 0, + "ResultSetMetaData columns start with 1; column cannot be less than 1"); + 
Preconditions.checkArgument( + arrayColumn <= rsmd.getColumnCount(), + "Column number cannot be more than the number of columns"); + + JdbcFieldInfo fieldInfo = config.getExplicitTypeByColumnIndex(arrayColumn); + if (fieldInfo == null) { + fieldInfo = config.getExplicitTypeByColumnName(rsmd.getColumnLabel(arrayColumn)); + } + if (fieldInfo != null) { + return fieldInfo; + } + return new JdbcFieldInfo(rsmd, arrayColumn); + } + + /* Uses the configuration to determine what the array sub-type JdbcFieldInfo is. + * If no sub-type can be found, returns null. + */ + private static JdbcFieldInfo getJdbcFieldInfoForArraySubType( + ResultSetMetaData rsmd, + int arrayColumn, + JdbcToArrowConfig config) + throws SQLException { + + Preconditions.checkNotNull(rsmd, "ResultSet MetaData object cannot be null"); + Preconditions.checkNotNull(config, "Configuration must not be null"); + Preconditions.checkArgument( + arrayColumn > 0, + "ResultSetMetaData columns start with 1; column cannot be less than 1"); + Preconditions.checkArgument( + arrayColumn <= rsmd.getColumnCount(), + "Column number cannot be more than the number of columns"); + + JdbcFieldInfo fieldInfo = config.getArraySubTypeByColumnIndex(arrayColumn); + if (fieldInfo == null) { + fieldInfo = config.getArraySubTypeByColumnName(rsmd.getColumnLabel(arrayColumn)); + } + return fieldInfo; + } + + /** + * Iterate the given JDBC {@link ResultSet} object to fetch the data and transpose it to populate + * the given Arrow Vector objects. + * + * @param rs ResultSet to use to fetch the data from underlying database + * @param root Arrow {@link VectorSchemaRoot} object to populate + * @param calendar The calendar to use when reading {@link Date}, {@link Time}, or {@link Timestamp} + * data types from the {@link ResultSet}, or null if not converting. + * @throws SQLException on error + */ + public static void jdbcToArrowVectors(ResultSet rs, VectorSchemaRoot root, Calendar calendar) + throws SQLException, IOException { + + Preconditions.checkNotNull(calendar, "Calendar object can't be null"); + + jdbcToArrowVectors(rs, root, new JdbcToArrowConfig(new RootAllocator(0), calendar)); + } + + static boolean isColumnNullable(ResultSetMetaData resultSetMetadata, int index, JdbcFieldInfo info) + throws SQLException { + int nullableValue; + if (info != null && info.isNullable() != ResultSetMetaData.columnNullableUnknown) { + nullableValue = info.isNullable(); + } else { + nullableValue = resultSetMetadata.isNullable(index); + } + return nullableValue == ResultSetMetaData.columnNullable || + nullableValue == ResultSetMetaData.columnNullableUnknown; + } + + /** + * Iterate the given JDBC {@link ResultSet} object to fetch the data and transpose it to populate + * the given Arrow Vector objects. + * + * @param rs ResultSet to use to fetch the data from underlying database + * @param root Arrow {@link VectorSchemaRoot} object to populate + * @param config The configuration to use when reading the data. 
+
+  /**
+   * Iterate the given JDBC {@link ResultSet} object to fetch the data and transpose it to populate
+   * the given Arrow Vector objects.
+   *
+   * @param rs     ResultSet to use to fetch the data from underlying database
+   * @param root   Arrow {@link VectorSchemaRoot} object to populate
+   * @param config The configuration to use when reading the data.
+   * @throws SQLException on error
+   * @throws IOException on error
+   */
+  public static void jdbcToArrowVectors(ResultSet rs, VectorSchemaRoot root, JdbcToArrowConfig config)
+      throws SQLException, IOException {
+
+    ResultSetMetaData rsmd = rs.getMetaData();
+    int columnCount = rsmd.getColumnCount();
+
+    JdbcConsumer[] consumers = new JdbcConsumer[columnCount];
+    for (int i = 1; i <= columnCount; i++) {
+      FieldVector vector = root.getVector(rsmd.getColumnLabel(i));
+      final JdbcFieldInfo columnFieldInfo = getJdbcFieldInfoForColumn(rsmd, i, config);
+      consumers[i - 1] = getConsumer(
+          vector.getField().getType(), i, isColumnNullable(rsmd, i, columnFieldInfo), vector, config);
+    }
+
+    CompositeJdbcConsumer compositeConsumer = null;
+    // Only clean up resources when an error occurs; the vectors inside the consumers are
+    // still useful, and callers are responsible for closing them.
+    try {
+      compositeConsumer = new CompositeJdbcConsumer(consumers);
+      int readRowCount = 0;
+      if (config.getTargetBatchSize() == JdbcToArrowConfig.NO_LIMIT_BATCH_SIZE) {
+        while (rs.next()) {
+          ValueVectorUtility.ensureCapacity(root, readRowCount + 1);
+          compositeConsumer.consume(rs);
+          readRowCount++;
+        }
+      } else {
+        while (readRowCount < config.getTargetBatchSize() && rs.next()) {
+          compositeConsumer.consume(rs);
+          readRowCount++;
+        }
+      }
+
+      root.setRowCount(readRowCount);
+    } catch (Exception e) {
+      // An error occurred: clean up the consumers before rethrowing.
+      if (compositeConsumer != null) {
+        compositeConsumer.close();
+      }
+      throw e;
+    }
+  }
+
+  static JdbcConsumer getConsumer(ArrowType arrowType, int columnIndex, boolean nullable,
+                                  FieldVector vector, JdbcToArrowConfig config) {
+    final Calendar calendar = config.getCalendar();
+
+    switch (arrowType.getTypeID()) {
+      case Bool:
+        return BitConsumer.createConsumer((BitVector) vector, columnIndex, nullable);
+      case Int:
+        switch (((ArrowType.Int) arrowType).getBitWidth()) {
+          case 8:
+            return TinyIntConsumer.createConsumer((TinyIntVector) vector, columnIndex, nullable);
+          case 16:
+            return SmallIntConsumer.createConsumer((SmallIntVector) vector, columnIndex, nullable);
+          case 32:
+            return IntConsumer.createConsumer((IntVector) vector, columnIndex, nullable);
+          case 64:
+            return BigIntConsumer.createConsumer((BigIntVector) vector, columnIndex, nullable);
+          default:
+            return null;
+        }
+      case Decimal:
+        final RoundingMode bigDecimalRoundingMode = config.getBigDecimalRoundingMode();
+        return DecimalConsumer.createConsumer((DecimalVector) vector, columnIndex, nullable, bigDecimalRoundingMode);
+      case FloatingPoint:
+        switch (((ArrowType.FloatingPoint) arrowType).getPrecision()) {
+          case SINGLE:
+            return FloatConsumer.createConsumer((Float4Vector) vector, columnIndex, nullable);
+          case DOUBLE:
+            return DoubleConsumer.createConsumer((Float8Vector) vector, columnIndex, nullable);
+          default:
+            return null;
+        }
+      case Utf8:
+      case LargeUtf8:
+        return VarCharConsumer.createConsumer((VarCharVector) vector, columnIndex, nullable);
+      case Binary:
+      case LargeBinary:
+        return BinaryConsumer.createConsumer((VarBinaryVector) vector, columnIndex, nullable);
+      case Date:
+        return DateConsumer.createConsumer((DateDayVector) vector, columnIndex, nullable, calendar);
+      case Time:
+        return TimeConsumer.createConsumer((TimeMilliVector) vector, columnIndex, nullable, calendar);
+      case Timestamp:
+        if (config.getCalendar() == null) {
+          return TimestampConsumer.createConsumer((TimeStampMilliVector) vector, columnIndex, nullable);
+        } else {
+          return TimestampTZConsumer.createConsumer((TimeStampMilliTZVector) vector, columnIndex, nullable, calendar);
+        }
+      case List:
+        FieldVector childVector = ((ListVector) vector).getDataVector();
+        JdbcConsumer delegate = getConsumer(childVector.getField().getType(), JDBC_ARRAY_VALUE_COLUMN,
+            childVector.getField().isNullable(), childVector, config);
+        return ArrayConsumer.createConsumer((ListVector) vector, delegate, columnIndex, nullable);
+      case Map:
+        return MapConsumer.createConsumer((MapVector) vector, columnIndex, nullable);
+      case Null:
+        return new NullConsumer((NullVector) vector);
+      default:
+        // no-op, shouldn't get here
+        throw new UnsupportedOperationException();
+    }
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/BaseColumnBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/BaseColumnBinder.java
new file mode 100644
index 0000000..e6702b3
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/BaseColumnBinder.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.FieldVector;
+
+/**
+ * Base class for ColumnBinder implementations.
+ *
+ * @param <V> The concrete FieldVector subtype.
+ */
+public abstract class BaseColumnBinder<V extends FieldVector> implements ColumnBinder {
+  protected final V vector;
+  protected final int jdbcType;
+
+  public BaseColumnBinder(V vector, int jdbcType) {
+    this.vector = vector;
+    this.jdbcType = jdbcType;
+  }
+
+  @Override
+  public int getJdbcType() {
+    return jdbcType;
+  }
+
+  @Override
+  public V getVector() {
+    return vector;
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/BigIntBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/BigIntBinder.java
new file mode 100644
index 0000000..da91d17
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/BigIntBinder.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.BigIntVector;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+
+/**
+ * A column binder for 64-bit integers.
+ */
+public class BigIntBinder extends BaseColumnBinder<BigIntVector> {
+  public BigIntBinder(BigIntVector vector) {
+    this(vector, Types.BIGINT);
+  }
+
+  public BigIntBinder(BigIntVector vector, int jdbcType) {
+    super(vector, jdbcType);
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    final long value = vector.getDataBuffer().getLong((long) rowIndex * BigIntVector.TYPE_WIDTH);
+    statement.setLong(parameterIndex, value);
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/BitBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/BitBinder.java
new file mode 100644
index 0000000..ebd5909
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/BitBinder.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.BitVector;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+
+/**
+ * A column binder for booleans.
+ */
+public class BitBinder extends BaseColumnBinder<BitVector> {
+  public BitBinder(BitVector vector) {
+    this(vector, Types.BOOLEAN);
+  }
+
+  public BitBinder(BitVector vector, int jdbcType) {
+    super(vector, jdbcType);
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    // See BitVector#getBit
+    final int byteIndex = rowIndex >> 3;
+    final byte b = vector.getDataBuffer().getByte(byteIndex);
+    final int bitIndex = rowIndex & 7;
+    final int value = (b >> bitIndex) & 0x01;
+    statement.setBoolean(parameterIndex, value != 0);
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/ColumnBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/ColumnBinder.java
new file mode 100644
index 0000000..f518150
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/ColumnBinder.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.FieldVector;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+
+/**
+ * A helper to bind values from a wrapped Arrow vector to a JDBC PreparedStatement.
+ */
+public interface ColumnBinder {
+  /**
+   * Create a column binder for a vector, using the default JDBC type code for null values.
+   */
+  static ColumnBinder forVector(FieldVector vector) {
+    return forVector(vector, /*jdbcType*/ null);
+  }
+
+  /**
+   * Create a column binder for a vector, overriding the JDBC type code used for null values.
+   *
+   * @param vector   The vector that the column binder will wrap.
+   * @param jdbcType The JDBC type code to use (or null to use the default).
+   */
+  static ColumnBinder forVector(FieldVector vector, Integer jdbcType) {
+    final ColumnBinder binder = vector.getField().getType().accept(new ColumnBinderArrowTypeVisitor(vector, jdbcType));
+    if (vector.getField().isNullable()) {
+      return new NullableColumnBinder(binder);
+    }
+    return binder;
+  }
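+
+  // Illustrative usage sketch (not part of the vendored Arrow code): binding each row of a
+  // populated VectorSchemaRoot to an INSERT statement. `root` and `conn` are assumed here.
+  //
+  //   List<ColumnBinder> binders = root.getFieldVectors().stream()
+  //       .map(ColumnBinder::forVector)
+  //       .collect(Collectors.toList());
+  //   try (PreparedStatement stmt = conn.prepareStatement("INSERT INTO t VALUES (?, ?)")) {
+  //     for (int row = 0; row < root.getRowCount(); row++) {
+  //       for (int i = 0; i < binders.size(); i++) {
+  //         binders.get(i).bind(stmt, /*parameterIndex*/ i + 1, row);
+  //       }
+  //       stmt.addBatch();
+  //     }
+  //     stmt.executeBatch();
+  //   }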
+
+  /**
+   * Bind the given row to the given parameter.
+   *
+   * @param statement      The statement to bind to.
+   * @param parameterIndex The parameter to bind to (1-indexed)
+   * @param rowIndex       The row to bind values from (0-indexed)
+   * @throws SQLException if an error occurs
+   */
+  void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException;
+
+  /**
+   * Get the JDBC type code used by this binder.
+   *
+   * @return A type code from {@link java.sql.Types}.
+   */
+  int getJdbcType();
+
+  /**
+   * Get the vector used by this binder.
+ */ + FieldVector getVector(); +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/ColumnBinderArrowTypeVisitor.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/ColumnBinderArrowTypeVisitor.java new file mode 100644 index 0000000..89ce9ab --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/ColumnBinderArrowTypeVisitor.java @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder; + +import org.apache.arrow.vector.*; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.types.pojo.ArrowType; + +import java.sql.Types; +import java.time.ZoneId; +import java.util.Calendar; +import java.util.TimeZone; + +/** + * Visitor to create the base ColumnBinder for a vector. + *
+ * <p>To handle null values, wrap the returned binder in a {@link NullableColumnBinder}.
+ */
+public class ColumnBinderArrowTypeVisitor implements ArrowType.ArrowTypeVisitor<ColumnBinder> {
+  private final FieldVector vector;
+  private final Integer jdbcType;
+
+  /**
+   * Create a binder using a custom JDBC type code.
+   *
+   * @param vector   The vector that the binder will wrap.
+   * @param jdbcType The JDBC type code (or null to use the default).
+   */
+  public ColumnBinderArrowTypeVisitor(FieldVector vector, Integer jdbcType) {
+    this.vector = vector;
+    this.jdbcType = jdbcType;
+  }
+
+  @Override
+  public ColumnBinder visit(ArrowType.Null type) {
+    throw new UnsupportedOperationException("No column binder implemented for type " + type);
+  }
+
+  @Override
+  public ColumnBinder visit(ArrowType.Struct type) {
+    throw new UnsupportedOperationException("No column binder implemented for type " + type);
+  }
+
+  @Override
+  public ColumnBinder visit(ArrowType.List type) {
+    return new ListBinder((ListVector) vector);
+  }
+
+  @Override
+  public ColumnBinder visit(ArrowType.LargeList type) {
+    throw new UnsupportedOperationException("No column binder implemented for type " + type);
+  }
+
+  @Override
+  public ColumnBinder visit(ArrowType.FixedSizeList type) {
+    throw new UnsupportedOperationException("No column binder implemented for type " + type);
+  }
+
+  @Override
+  public ColumnBinder visit(ArrowType.Union type) {
+    throw new UnsupportedOperationException("No column binder implemented for type " + type);
+  }
+
+  @Override
+  public ColumnBinder visit(ArrowType.Map type) {
+    return new MapBinder((MapVector) vector);
+  }
+
+  @Override
+  public ColumnBinder visit(ArrowType.Int type) {
+    if (!type.getIsSigned()) {
+      throw new UnsupportedOperationException(
+          "No column binder implemented for unsigned type " + type);
+    }
+    switch (type.getBitWidth()) {
+      case 8:
+        return jdbcType == null ? new TinyIntBinder((TinyIntVector) vector) :
+            new TinyIntBinder((TinyIntVector) vector, jdbcType);
+      case 16:
+        return jdbcType == null ? new SmallIntBinder((SmallIntVector) vector) :
+            new SmallIntBinder((SmallIntVector) vector, jdbcType);
+      case 32:
+        return jdbcType == null ? new IntBinder((IntVector) vector) :
+            new IntBinder((IntVector) vector, jdbcType);
+      case 64:
+        return jdbcType == null ? new BigIntBinder((BigIntVector) vector) :
+            new BigIntBinder((BigIntVector) vector, jdbcType);
+      default:
+        throw new UnsupportedOperationException("No column binder implemented for type " + type);
+    }
+  }
+
+  @Override
+  public ColumnBinder visit(ArrowType.FloatingPoint type) {
+    switch (type.getPrecision()) {
+      case SINGLE:
+        return jdbcType == null ? new Float4Binder((Float4Vector) vector) :
+            new Float4Binder((Float4Vector) vector, jdbcType);
+      case DOUBLE:
+        return jdbcType == null ? new Float8Binder((Float8Vector) vector) :
+            new Float8Binder((Float8Vector) vector, jdbcType);
+      default:
+        throw new UnsupportedOperationException("No column binder implemented for type " + type);
+    }
+  }
+
+  @Override
+  public ColumnBinder visit(ArrowType.Utf8 type) {
+    VarCharVector varChar = (VarCharVector) vector;
+    return jdbcType == null ? new VarCharBinder<>(varChar, Types.VARCHAR) :
+        new VarCharBinder<>(varChar, jdbcType);
+  }
+
+  @Override
+  public ColumnBinder visit(ArrowType.LargeUtf8 type) {
+    LargeVarCharVector varChar = (LargeVarCharVector) vector;
+    return jdbcType == null ?
new VarCharBinder<>(varChar, Types.LONGVARCHAR) : + new VarCharBinder<>(varChar, jdbcType); + } + + @Override + public ColumnBinder visit(ArrowType.Binary type) { + VarBinaryVector varBinary = (VarBinaryVector) vector; + return jdbcType == null ? new VarBinaryBinder<>(varBinary, Types.VARBINARY) : + new VarBinaryBinder<>(varBinary, jdbcType); + } + + @Override + public ColumnBinder visit(ArrowType.LargeBinary type) { + LargeVarBinaryVector varBinary = (LargeVarBinaryVector) vector; + return jdbcType == null ? new VarBinaryBinder<>(varBinary, Types.LONGVARBINARY) : + new VarBinaryBinder<>(varBinary, jdbcType); + } + + @Override + public ColumnBinder visit(ArrowType.FixedSizeBinary type) { + FixedSizeBinaryVector binary = (FixedSizeBinaryVector) vector; + return jdbcType == null ? new FixedSizeBinaryBinder(binary, Types.BINARY) : + new FixedSizeBinaryBinder(binary, jdbcType); + } + + @Override + public ColumnBinder visit(ArrowType.Bool type) { + return jdbcType == null ? new BitBinder((BitVector) vector) : new BitBinder((BitVector) vector, jdbcType); + } + + @Override + public ColumnBinder visit(ArrowType.Decimal type) { + if (type.getBitWidth() == 128) { + DecimalVector decimalVector = (DecimalVector) vector; + return jdbcType == null ? new Decimal128Binder(decimalVector) : new Decimal128Binder(decimalVector, jdbcType); + } else if (type.getBitWidth() == 256) { + Decimal256Vector decimalVector = (Decimal256Vector) vector; + return jdbcType == null ? new Decimal256Binder(decimalVector) : new Decimal256Binder(decimalVector, jdbcType); + } + throw new UnsupportedOperationException("No column binder implemented for type " + type); + } + + @Override + public ColumnBinder visit(ArrowType.Date type) { + switch (type.getUnit()) { + case DAY: + return jdbcType == null ? new DateDayBinder((DateDayVector) vector) : + new DateDayBinder((DateDayVector) vector, /*calendar*/null, jdbcType); + case MILLISECOND: + return jdbcType == null ? new DateMilliBinder((DateMilliVector) vector) : + new DateMilliBinder((DateMilliVector) vector, /*calendar*/null, jdbcType); + default: + throw new UnsupportedOperationException("No column binder implemented for type " + type); + } + } + + @Override + public ColumnBinder visit(ArrowType.Time type) { + switch (type.getUnit()) { + case SECOND: + return jdbcType == null ? new Time32Binder((TimeSecVector) vector) : + new Time32Binder((TimeSecVector) vector, jdbcType); + case MILLISECOND: + return jdbcType == null ? new Time32Binder((TimeMilliVector) vector) : + new Time32Binder((TimeMilliVector) vector, jdbcType); + case MICROSECOND: + return jdbcType == null ? new Time64Binder((TimeMicroVector) vector) : + new Time64Binder((TimeMicroVector) vector, jdbcType); + case NANOSECOND: + return jdbcType == null ? 
new Time64Binder((TimeNanoVector) vector) : + new Time64Binder((TimeNanoVector) vector, jdbcType); + default: + throw new UnsupportedOperationException("No column binder implemented for type " + type); + } + } + + @Override + public ColumnBinder visit(ArrowType.Timestamp type) { + Calendar calendar = null; + final String timezone = type.getTimezone(); + if (timezone != null && !timezone.isEmpty()) { + calendar = Calendar.getInstance(TimeZone.getTimeZone(ZoneId.of(timezone))); + } + return new TimeStampBinder((TimeStampVector) vector, calendar); + } + + @Override + public ColumnBinder visit(ArrowType.Interval type) { + throw new UnsupportedOperationException("No column binder implemented for type " + type); + } + + @Override + public ColumnBinder visit(ArrowType.Duration type) { + throw new UnsupportedOperationException("No column binder implemented for type " + type); + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/DateDayBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/DateDayBinder.java new file mode 100644 index 0000000..d1bd580 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/DateDayBinder.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder; + +import org.apache.arrow.vector.DateDayVector; + +import java.sql.Date; +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.sql.Types; +import java.util.Calendar; + +/** + * A column binder for 32-bit dates. 
+ */
+public class DateDayBinder extends BaseColumnBinder<DateDayVector> {
+  private static final long MILLIS_PER_DAY = 86_400_000;
+  private final Calendar calendar;
+
+  public DateDayBinder(DateDayVector vector) {
+    this(vector, null, Types.DATE);
+  }
+
+  public DateDayBinder(DateDayVector vector, Calendar calendar) {
+    this(vector, calendar, Types.DATE);
+  }
+
+  public DateDayBinder(DateDayVector vector, Calendar calendar, int jdbcType) {
+    super(vector, jdbcType);
+    this.calendar = calendar;
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    // TODO: multiply with overflow
+    final long index = (long) rowIndex * DateDayVector.TYPE_WIDTH;
+    final Date value = new Date(vector.getDataBuffer().getInt(index) * MILLIS_PER_DAY);
+    if (calendar == null) {
+      statement.setDate(parameterIndex, value);
+    } else {
+      statement.setDate(parameterIndex, value, calendar);
+    }
+  }
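+
+  // Worked example (illustrative, not vendored code): DateDayVector stores days since the
+  // Unix epoch as a 32-bit int. Day 19_000 (2022-01-08) becomes
+  // 19_000 * 86_400_000 = 1_641_600_000_000 ms, the value wrapped in java.sql.Date above.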
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/DateMilliBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/DateMilliBinder.java
new file mode 100644
index 0000000..1d25423
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/DateMilliBinder.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.DateMilliVector;
+
+import java.sql.Date;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+import java.util.Calendar;
+
+/**
+ * A column binder for 64-bit dates.
+ */
+public class DateMilliBinder extends BaseColumnBinder<DateMilliVector> {
+  private final Calendar calendar;
+
+  public DateMilliBinder(DateMilliVector vector) {
+    this(vector, null, Types.DATE);
+  }
+
+  public DateMilliBinder(DateMilliVector vector, Calendar calendar) {
+    this(vector, calendar, Types.DATE);
+  }
+
+  public DateMilliBinder(DateMilliVector vector, Calendar calendar, int jdbcType) {
+    super(vector, jdbcType);
+    this.calendar = calendar;
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    final long index = (long) rowIndex * DateMilliVector.TYPE_WIDTH;
+    final Date value = new Date(vector.getDataBuffer().getLong(index));
+    if (calendar == null) {
+      statement.setDate(parameterIndex, value);
+    } else {
+      statement.setDate(parameterIndex, value, calendar);
+    }
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Decimal128Binder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Decimal128Binder.java
new file mode 100644
index 0000000..8a00fd8
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Decimal128Binder.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.DecimalVector;
+import org.apache.arrow.vector.util.DecimalUtility;
+
+import java.math.BigDecimal;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+
+/**
+ * A binder for 128-bit decimals.
+ */
+public class Decimal128Binder extends BaseColumnBinder<DecimalVector> {
+  public Decimal128Binder(DecimalVector vector) {
+    this(vector, Types.DECIMAL);
+  }
+
+  public Decimal128Binder(DecimalVector vector, int jdbcType) {
+    super(vector, jdbcType);
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    final BigDecimal value = DecimalUtility.getBigDecimalFromArrowBuf(
+        vector.getDataBuffer(), rowIndex, vector.getScale(), DecimalVector.TYPE_WIDTH);
+    statement.setBigDecimal(parameterIndex, value);
+  }
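+
+  // Worked example (illustrative, not vendored code): for a DECIMAL(10, 4) column the vector
+  // holds the unscaled 128-bit value; an unscaled value of 1_234_500 with scale 4 is read
+  // back as new BigDecimal("123.4500") before being passed to setBigDecimal.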
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Decimal256Binder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Decimal256Binder.java
new file mode 100644
index 0000000..314534e
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Decimal256Binder.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.Decimal256Vector;
+import org.apache.arrow.vector.util.DecimalUtility;
+
+import java.math.BigDecimal;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+
+/**
+ * A binder for 256-bit decimals.
+ */
+public class Decimal256Binder extends BaseColumnBinder<Decimal256Vector> {
+  public Decimal256Binder(Decimal256Vector vector) {
+    this(vector, Types.DECIMAL);
+  }
+
+  public Decimal256Binder(Decimal256Vector vector, int jdbcType) {
+    super(vector, jdbcType);
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    final BigDecimal value = DecimalUtility.getBigDecimalFromArrowBuf(
+        vector.getDataBuffer(), rowIndex, vector.getScale(), Decimal256Vector.TYPE_WIDTH);
+    statement.setBigDecimal(parameterIndex, value);
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/FixedSizeBinaryBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/FixedSizeBinaryBinder.java
new file mode 100644
index 0000000..b202891
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/FixedSizeBinaryBinder.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.FixedSizeBinaryVector;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+
+/**
+ * A binder for fixed-width binary types.
+ */
+public class FixedSizeBinaryBinder extends BaseColumnBinder<FixedSizeBinaryVector> {
+  /**
+   * Create a binder for the given vector using the given JDBC type for null values.
+   *
+   * @param vector   The vector to draw values from.
+   * @param jdbcType The JDBC type code.
+   */
+  public FixedSizeBinaryBinder(FixedSizeBinaryVector vector, int jdbcType) {
+    super(vector, jdbcType);
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    byte[] binaryData = new byte[vector.getByteWidth()];
+    vector.getDataBuffer().getBytes((long) rowIndex * binaryData.length, binaryData, 0, binaryData.length);
+    statement.setBytes(parameterIndex, binaryData);
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Float4Binder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Float4Binder.java
new file mode 100644
index 0000000..01b6606
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Float4Binder.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.Float4Vector;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+
+/**
+ * A binder for 32-bit floats.
+ */
+public class Float4Binder extends BaseColumnBinder<Float4Vector> {
+  public Float4Binder(Float4Vector vector) {
+    this(vector, Types.REAL);
+  }
+
+  public Float4Binder(Float4Vector vector, int jdbcType) {
+    super(vector, jdbcType);
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    final float value = vector.getDataBuffer().getFloat((long) rowIndex * Float4Vector.TYPE_WIDTH);
+    statement.setFloat(parameterIndex, value);
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Float8Binder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Float8Binder.java
new file mode 100644
index 0000000..1568657
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Float8Binder.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.Float8Vector;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+
+/**
+ * A binder for 64-bit floats.
+ */
+public class Float8Binder extends BaseColumnBinder<Float8Vector> {
+  public Float8Binder(Float8Vector vector) {
+    this(vector, Types.DOUBLE);
+  }
+
+  public Float8Binder(Float8Vector vector, int jdbcType) {
+    super(vector, jdbcType);
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    final double value = vector.getDataBuffer().getDouble((long) rowIndex * Float8Vector.TYPE_WIDTH);
+    statement.setDouble(parameterIndex, value);
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/IntBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/IntBinder.java
new file mode 100644
index 0000000..77291e0
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/IntBinder.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.IntVector;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+
+/**
+ * A column binder for 32-bit integers.
+ */
+public class IntBinder extends BaseColumnBinder<IntVector> {
+  public IntBinder(IntVector vector) {
+    this(vector, Types.INTEGER);
+  }
+
+  public IntBinder(IntVector vector, int jdbcType) {
+    super(vector, jdbcType);
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    final int value = vector.getDataBuffer().getInt((long) rowIndex * IntVector.TYPE_WIDTH);
+    statement.setInt(parameterIndex, value);
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/ListBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/ListBinder.java
new file mode 100644
index 0000000..0d09456
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/ListBinder.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.complex.ListVector;
+import org.apache.arrow.vector.complex.impl.UnionListReader;
+import org.apache.arrow.vector.util.Text;
+
+import java.lang.reflect.Array;
+import java.util.ArrayList;
+import java.util.Arrays;
+
+/**
+ * A column binder for list of primitive values.
+ */
+public class ListBinder extends BaseColumnBinder<ListVector> {
+
+  private final UnionListReader listReader;
+  private final Class<?> arrayElementClass;
+  private final boolean isTextColumn;
+
+  public ListBinder(ListVector vector) {
+    this(vector, java.sql.Types.ARRAY);
+  }
+
+  /**
+   * Init ListBinder and determine type of data vector.
+   *
+   * @param vector   corresponding data vector from arrow buffer for binding
+   * @param jdbcType parameter jdbc type
+   */
+  public ListBinder(ListVector vector, int jdbcType) {
+    super(vector, jdbcType);
+    listReader = vector.getReader();
+    Class<?> dataVectorClass = vector.getDataVector().getClass();
+    try {
+      arrayElementClass = dataVectorClass.getMethod("getObject", Integer.TYPE).getReturnType();
+    } catch (NoSuchMethodException e) {
+      final String message = String.format("Issue to determine type for getObject method of data vector class %s",
+          dataVectorClass.getName());
+      throw new RuntimeException(message, e);
+    }
+    isTextColumn = arrayElementClass.isAssignableFrom(Text.class);
+  }
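+
+  // Illustrative note (not vendored code): the reflection above inspects the element
+  // vector's getObject(int) return type so bind() can build a correctly typed Object[].
+  // For example, a ListVector backed by an IntVector yields Integer, so a row [1, 2, 3]
+  // becomes an Integer[] passed to setObject; Text elements are converted to String first.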
+
+  @Override
+  public void bind(java.sql.PreparedStatement statement, int parameterIndex, int rowIndex) throws java.sql.SQLException {
+    listReader.setPosition(rowIndex);
+    ArrayList<?> sourceArray = (ArrayList<?>) listReader.readObject();
+    Object array;
+    if (!isTextColumn) {
+      array = Array.newInstance(arrayElementClass, sourceArray.size());
+      Arrays.setAll((Object[]) array, sourceArray::get);
+    } else {
+      array = new String[sourceArray.size()];
+      Arrays.setAll((Object[]) array, idx -> sourceArray.get(idx) != null ? sourceArray.get(idx).toString() : null);
+    }
+    statement.setObject(parameterIndex, array);
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/MapBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/MapBinder.java
new file mode 100644
index 0000000..25b0d74
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/MapBinder.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.complex.MapVector;
+import org.apache.arrow.vector.complex.impl.UnionMapReader;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.util.JsonStringHashMap;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * A column binder for map of primitive values.
+ */
+public class MapBinder extends BaseColumnBinder<MapVector> {
+
+  private final boolean isTextKey;
+  private final boolean isTextValue;
+  private UnionMapReader reader;
+
+  public MapBinder(MapVector vector) {
+    this(vector, Types.VARCHAR);
+  }
+
+  /**
+   * Init MapBinder and determine type of data vector.
+   *
+   * @param vector   corresponding data vector from arrow buffer for binding
+   * @param jdbcType parameter jdbc type
+   */
+  public MapBinder(MapVector vector, int jdbcType) {
+    super(vector, jdbcType);
+    reader = vector.getReader();
+    List<Field> structField = Objects.requireNonNull(vector.getField()).getChildren();
+    if (structField.size() != 1) {
+      throw new IllegalArgumentException("Expected Struct field metadata inside Map field");
+    }
+    List<Field> keyValueFields = Objects.requireNonNull(structField.get(0)).getChildren();
+    if (keyValueFields.size() != 2) {
+      throw new IllegalArgumentException("Expected two children fields " +
+          "inside nested Struct field in Map");
+    }
+    ArrowType keyType = Objects.requireNonNull(keyValueFields.get(0)).getType();
+    ArrowType valueType = Objects.requireNonNull(keyValueFields.get(1)).getType();
+    isTextKey = ArrowType.Utf8.INSTANCE.equals(keyType);
+    isTextValue = ArrowType.Utf8.INSTANCE.equals(valueType);
+  }
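+
+  // Illustrative note (not vendored code, assuming JsonStringHashMap#toString renders JSON):
+  // with the default VARCHAR jdbcType, a row whose map is {"k1" -> "v1"} is bound as the
+  // string {"k1":"v1"} via setString; with Types.OTHER the Map object itself is handed to
+  // setObject, leaving interpretation to the JDBC driver.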
+
+  @Override
+  public void bind(PreparedStatement statement,
+                   int parameterIndex, int rowIndex) throws SQLException {
+    reader.setPosition(rowIndex);
+    LinkedHashMap<Object, Object> tags = new JsonStringHashMap<>();
+    while (reader.next()) {
+      Object key = reader.key().readObject();
+      Object value = reader.value().readObject();
+      tags.put(isTextKey && key != null ? key.toString() : key,
+          isTextValue && value != null ? value.toString() : value);
+    }
+    switch (jdbcType) {
+      case Types.VARCHAR:
+        statement.setString(parameterIndex, tags.toString());
+        break;
+      case Types.OTHER:
+      default:
+        statement.setObject(parameterIndex, tags);
+    }
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/NullableColumnBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/NullableColumnBinder.java
new file mode 100644
index 0000000..f765462
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/NullableColumnBinder.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.FieldVector;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+
+/**
+ * A ColumnBinder that checks for nullability before deferring to a type-specific binder.
+ */
+public class NullableColumnBinder implements ColumnBinder {
+  private final ColumnBinder wrapped;
+
+  public NullableColumnBinder(ColumnBinder wrapped) {
+    this.wrapped = wrapped;
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    if (wrapped.getVector().isNull(rowIndex)) {
+      statement.setNull(parameterIndex, wrapped.getJdbcType());
+    } else {
+      wrapped.bind(statement, parameterIndex, rowIndex);
+    }
+  }
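+
+  // Illustrative usage (not vendored code): ColumnBinder.forVector already applies this
+  // wrapper to nullable fields, so callers rarely construct it directly:
+  //
+  //   ColumnBinder binder = new NullableColumnBinder(new IntBinder(intVector));
+  //   binder.bind(statement, 1, row); // calls setNull(1, Types.INTEGER) for null rows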
+
+  @Override
+  public int getJdbcType() {
+    return wrapped.getJdbcType();
+  }
+
+  @Override
+  public FieldVector getVector() {
+    return wrapped.getVector();
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/SmallIntBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/SmallIntBinder.java
new file mode 100644
index 0000000..87f75a6
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/SmallIntBinder.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.SmallIntVector;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+
+/**
+ * A column binder for 16-bit integers.
+ */
+public class SmallIntBinder extends BaseColumnBinder<SmallIntVector> {
+  public SmallIntBinder(SmallIntVector vector) {
+    this(vector, Types.SMALLINT);
+  }
+
+  public SmallIntBinder(SmallIntVector vector, int jdbcType) {
+    super(vector, jdbcType);
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    final short value = vector.getDataBuffer().getShort((long) rowIndex * SmallIntVector.TYPE_WIDTH);
+    statement.setShort(parameterIndex, value);
+  }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Time32Binder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Time32Binder.java
new file mode 100644
index 0000000..d01b737
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Time32Binder.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.BaseFixedWidthVector;
+import org.apache.arrow.vector.TimeMilliVector;
+import org.apache.arrow.vector.TimeSecVector;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Time;
+import java.sql.Types;
+
+/**
+ * A binder for 32-bit time types.
+ */
+public class Time32Binder extends BaseColumnBinder<BaseFixedWidthVector> {
+  private static final long TYPE_WIDTH = 4;
+
+  private final long factor;
+
+  public Time32Binder(TimeSecVector vector) {
+    this(vector, Types.TIME);
+  }
+
+  public Time32Binder(TimeMilliVector vector) {
+    this(vector, Types.TIME);
+  }
+
+  public Time32Binder(TimeSecVector vector, int jdbcType) {
+    this(vector, /*factor*/1_000, jdbcType);
+  }
+
+  public Time32Binder(TimeMilliVector vector, int jdbcType) {
+    this(vector, /*factor*/1, jdbcType);
+  }
+
+  Time32Binder(BaseFixedWidthVector vector, long factor, int jdbcType) {
+    super(vector, jdbcType);
+    this.factor = factor;
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    // TODO: multiply with overflow
+    // TODO: take in a Calendar as well?
+    final Time value = new Time(vector.getDataBuffer().getInt(rowIndex * TYPE_WIDTH) * factor);
+    statement.setTime(parameterIndex, value);
+  }
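+
+  // Worked example (illustrative, not vendored code): a TimeSecVector entry of 12_345
+  // seconds is scaled by factor 1_000 to 12_345_000 ms, i.e. 03:25:45, before being
+  // wrapped in java.sql.Time.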
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Time64Binder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Time64Binder.java
new file mode 100644
index 0000000..12a8d5a
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/Time64Binder.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.BaseFixedWidthVector;
+import org.apache.arrow.vector.TimeMicroVector;
+import org.apache.arrow.vector.TimeNanoVector;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Time;
+import java.sql.Types;
+
+/**
+ * A binder for 64-bit time types.
+ */
+public class Time64Binder extends BaseColumnBinder<BaseFixedWidthVector> {
+  private static final long TYPE_WIDTH = 8;
+
+  private final long factor;
+
+  public Time64Binder(TimeMicroVector vector) {
+    this(vector, Types.TIME);
+  }
+
+  public Time64Binder(TimeNanoVector vector) {
+    this(vector, Types.TIME);
+  }
+
+  public Time64Binder(TimeMicroVector vector, int jdbcType) {
+    this(vector, /*factor*/1_000, jdbcType);
+  }
+
+  public Time64Binder(TimeNanoVector vector, int jdbcType) {
+    this(vector, /*factor*/1_000_000, jdbcType);
+  }
+
+  Time64Binder(BaseFixedWidthVector vector, long factor, int jdbcType) {
+    super(vector, jdbcType);
+    this.factor = factor;
+  }
+
+  @Override
+  public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+    // TODO: option to throw on truncation (vendor Guava IntMath#multiply)
+    final Time value = new Time(vector.getDataBuffer().getLong(rowIndex * TYPE_WIDTH) / factor);
+    statement.setTime(parameterIndex, value);
+  }
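+
+  // Worked example (illustrative, not vendored code): a TimeMicroVector entry of
+  // 12_345_678 µs is divided by factor 1_000, truncating to 12_345 ms; sub-millisecond
+  // precision is lost, which is what the TODO above is about.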
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/TimeStampBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/TimeStampBinder.java
new file mode 100644
index 0000000..da859d1
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/TimeStampBinder.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.vector.TimeStampVector;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Timestamp;
+import java.sql.Types;
+import java.util.Calendar;
+
+/**
+ * A column binder for timestamps.
+ */
+public class TimeStampBinder extends BaseColumnBinder<TimeStampVector> {
+  private final Calendar calendar;
+  private final long unitsPerSecond;
+  private final long nanosPerUnit;
+
+  /**
+   * Create a binder for a timestamp vector using the default JDBC type code.
+   */
+  public TimeStampBinder(TimeStampVector vector, Calendar calendar) {
+    this(vector, calendar, isZoned(vector.getField().getType()) ? Types.TIMESTAMP_WITH_TIMEZONE : Types.TIMESTAMP);
+  }
+
+  /**
+   * Create a binder for a timestamp vector.
+   *
+   * @param vector   The vector to pull values from.
+   * @param calendar Optionally, the calendar to pass to JDBC.
+   * @param jdbcType The JDBC type code to use for null values.
+ */ + public TimeStampBinder(TimeStampVector vector, Calendar calendar, int jdbcType) { + super(vector, jdbcType); + this.calendar = calendar; + + final ArrowType.Timestamp type = (ArrowType.Timestamp) vector.getField().getType(); + switch (type.getUnit()) { + case SECOND: + this.unitsPerSecond = 1; + this.nanosPerUnit = 1_000_000_000; + break; + case MILLISECOND: + this.unitsPerSecond = 1_000; + this.nanosPerUnit = 1_000_000; + break; + case MICROSECOND: + this.unitsPerSecond = 1_000_000; + this.nanosPerUnit = 1_000; + break; + case NANOSECOND: + this.unitsPerSecond = 1_000_000_000; + this.nanosPerUnit = 1; + break; + default: + throw new IllegalArgumentException("Invalid time unit in " + type); + } + } + + private static boolean isZoned(ArrowType type) { + final String timezone = ((ArrowType.Timestamp) type).getTimezone(); + return timezone != null && !timezone.isEmpty(); + } + + @Override + public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException { + // TODO: option to throw on truncation (vendor Guava IntMath#multiply) or overflow + final long rawValue = vector.getDataBuffer().getLong((long) rowIndex * TimeStampVector.TYPE_WIDTH); + final long seconds = rawValue / unitsPerSecond; + final int nanos = (int) ((rawValue - (seconds * unitsPerSecond)) * nanosPerUnit); + final Timestamp value = new Timestamp(seconds * 1_000); + value.setNanos(nanos); + if (calendar != null) { + // Timestamp == Date == UTC timestamp (confusingly). Arrow's timestamp with timezone is a UTC value with a + // zone offset, so we don't need to do any conversion. + statement.setTimestamp(parameterIndex, value, calendar); + } else { + // Arrow timestamp without timezone isn't strictly convertible to any timezone. So this is technically wrong, + // but there is no 'correct' interpretation here. The application should provide a calendar. + statement.setTimestamp(parameterIndex, value); + } + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/TinyIntBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/TinyIntBinder.java new file mode 100644 index 0000000..616bca2 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/TinyIntBinder.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder; + +import org.apache.arrow.vector.TinyIntVector; + +import java.sql.PreparedStatement; +import java.sql.SQLException; +import java.sql.Types; + +/** + * A column binder for 8-bit integers. 
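+ *
+ * <p>Usage sketch (illustrative only; assumes an existing {@code allocator} and a
+ * prepared statement {@code stmt}):
+ * <pre>{@code
+ * try (TinyIntVector vector = new TinyIntVector("v", allocator)) {
+ *     vector.setSafe(0, (byte) 42);
+ *     vector.setValueCount(1);
+ *     new TinyIntBinder(vector).bind(stmt, 1, 0); // effectively stmt.setByte(1, (byte) 42)
+ * }
+ * }</pre>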
+ */
+public class TinyIntBinder extends BaseColumnBinder<TinyIntVector> {
+    public TinyIntBinder(TinyIntVector vector) {
+        this(vector, Types.TINYINT);
+    }
+
+    public TinyIntBinder(TinyIntVector vector, int jdbcType) {
+        super(vector, jdbcType);
+    }
+
+    @Override
+    public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+        final byte value = vector.getDataBuffer().getByte((long) rowIndex * TinyIntVector.TYPE_WIDTH);
+        statement.setByte(parameterIndex, value);
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/VarBinaryBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/VarBinaryBinder.java
new file mode 100644
index 0000000..5cb3dba
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/VarBinaryBinder.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.memory.util.ArrowBufPointer;
+import org.apache.arrow.vector.ElementAddressableVector;
+import org.apache.arrow.vector.FieldVector;
+
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+
+/**
+ * A binder for variable-width binary types.
+ *
+ * @param <T> The binary vector.
+ */
+public class VarBinaryBinder<T extends FieldVector & ElementAddressableVector> extends BaseColumnBinder<T> {
+    private final ArrowBufPointer element;
+
+    /**
+     * Create a binder for the given vector using the given JDBC type for null values.
+     *
+     * @param vector   The vector to draw values from.
+     * @param jdbcType The JDBC type code.
+     */
+    public VarBinaryBinder(T vector, int jdbcType) {
+        super(vector, jdbcType);
+        this.element = new ArrowBufPointer();
+    }
+
+    @Override
+    public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+        vector.getDataPointer(rowIndex, element);
+        if (element.getBuf() == null) {
+            statement.setNull(parameterIndex, jdbcType);
+            return;
+        }
+        if (element.getLength() > (long) Integer.MAX_VALUE) {
+            final String message = String.format("Length of value at index %d (%d) exceeds Integer.MAX_VALUE",
+                    rowIndex, element.getLength());
+            throw new RuntimeException(message);
+        }
+        byte[] binaryData = new byte[(int) element.getLength()];
+        element.getBuf().getBytes(element.getOffset(), binaryData);
+        statement.setBytes(parameterIndex, binaryData);
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/VarCharBinder.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/VarCharBinder.java
new file mode 100644
index 0000000..eb458f8
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/VarCharBinder.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
+
+import org.apache.arrow.memory.util.ArrowBufPointer;
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.VariableWidthVector;
+
+import java.nio.charset.StandardCharsets;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+
+/**
+ * A binder for variable-width string types.
+ *
+ * @param <T> The text vector.
+ */
+public class VarCharBinder<T extends FieldVector & VariableWidthVector> extends BaseColumnBinder<T> {
+    private final ArrowBufPointer element;
+
+    /**
+     * Create a binder for the given vector using the given JDBC type for null values.
+     *
+     * @param vector   The vector to draw values from.
+     * @param jdbcType The JDBC type code.
+     */
+    public VarCharBinder(T vector, int jdbcType) {
+        super(vector, jdbcType);
+        this.element = new ArrowBufPointer();
+    }
+
+    @Override
+    public void bind(PreparedStatement statement, int parameterIndex, int rowIndex) throws SQLException {
+        vector.getDataPointer(rowIndex, element);
+        if (element.getBuf() == null) {
+            statement.setNull(parameterIndex, jdbcType);
+            return;
+        }
+        if (element.getLength() > (long) Integer.MAX_VALUE) {
+            final String message = String.format("Length of value at index %d (%d) exceeds Integer.MAX_VALUE",
+                    rowIndex, element.getLength());
+            throw new RuntimeException(message);
+        }
+        byte[] utf8Bytes = new byte[(int) element.getLength()];
+        element.getBuf().getBytes(element.getOffset(), utf8Bytes);
+        statement.setString(parameterIndex, new String(utf8Bytes, StandardCharsets.UTF_8));
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/package-info.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/package-info.java
new file mode 100644
index 0000000..76674e0
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/binder/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Utilities to bind Arrow data as JDBC prepared statement parameters.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.binder;
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/ArrayConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/ArrayConsumer.java
new file mode 100644
index 0000000..85cdb09
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/ArrayConsumer.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer;
+
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.ValueVector;
+import org.apache.arrow.vector.complex.ListVector;
+
+import java.io.IOException;
+import java.sql.Array;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+/**
+ * Consumer which consumes array type values from {@link ResultSet}.
+ * Write the data to {@link ListVector}.
+ */
+public abstract class ArrayConsumer extends BaseConsumer<ListVector> {
+
+    protected final JdbcConsumer delegate;
+    private final ValueVector innerVector;
+    protected int innerVectorIndex = 0;
+
+    /**
+     * Instantiate an ArrayConsumer.
+     */
+    public ArrayConsumer(ListVector vector, JdbcConsumer delegate, int index) {
+        super(vector, index);
+        this.delegate = delegate;
+        this.innerVector = vector.getDataVector();
+    }
+
+    /**
+     * Creates a consumer for {@link ListVector}.
+     */
+    public static ArrayConsumer createConsumer(
+            ListVector vector, JdbcConsumer delegate, int index, boolean nullable) {
+        if (nullable) {
+            return new NullableArrayConsumer(vector, delegate, index);
+        } else {
+            return new NonNullableArrayConsumer(vector, delegate, index);
+        }
+    }
+
+    @Override
+    public void close() throws Exception {
+        this.vector.close();
+        this.delegate.close();
+    }
+
+    @Override
+    public void resetValueVector(ListVector vector) {
+        super.resetValueVector(vector);
+
+        FieldVector childVector = vector.getDataVector();
+        this.delegate.resetValueVector(childVector);
+
+        innerVectorIndex = 0;
+    }
+
+    void ensureInnerVectorCapacity(int targetCapacity) {
+        while (innerVector.getValueCapacity() < targetCapacity) {
+            innerVector.reAlloc();
+        }
+    }
+
+    /**
+     * Nullable consumer for {@link ListVector}.
+     */
+    static class NullableArrayConsumer extends ArrayConsumer {
+
+        /**
+         * Instantiate a nullable array consumer.
+         */
+        public NullableArrayConsumer(ListVector vector, JdbcConsumer delegate, int index) {
+            super(vector, delegate, index);
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException, IOException {
+            final Array array = resultSet.getArray(columnIndexInResultSet);
+            if (!resultSet.wasNull()) {
+                vector.startNewValue(currentIndex);
+                int count = 0;
+                try (ResultSet rs = array.getResultSet()) {
+                    while (rs.next()) {
+                        ensureInnerVectorCapacity(innerVectorIndex + count + 1);
+                        delegate.consume(rs);
+                        count++;
+                    }
+                }
+                vector.endValue(currentIndex, count);
+                innerVectorIndex += count;
+            }
+            currentIndex++;
+        }
+    }
+
+    /**
+     * Non-nullable consumer for {@link ListVector}.
+     */
+    static class NonNullableArrayConsumer extends ArrayConsumer {
+
+        /**
+         * Instantiate a non-nullable array consumer.
+         */
+        public NonNullableArrayConsumer(ListVector vector, JdbcConsumer delegate, int index) {
+            super(vector, delegate, index);
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException, IOException {
+            final Array array = resultSet.getArray(columnIndexInResultSet);
+            vector.startNewValue(currentIndex);
+            int count = 0;
+            try (ResultSet rs = array.getResultSet()) {
+                while (rs.next()) {
+                    ensureInnerVectorCapacity(innerVectorIndex + count + 1);
+                    delegate.consume(rs);
+                    count++;
+                }
+            }
+            vector.endValue(currentIndex, count);
+            innerVectorIndex += count;
+            currentIndex++;
+        }
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BaseConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BaseConsumer.java
new file mode 100644
index 0000000..c896941
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BaseConsumer.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer;
+
+import org.apache.arrow.vector.ValueVector;
+
+/**
+ * Base class for all consumers.
+ *
+ * @param <V> vector type.
+ */
+public abstract class BaseConsumer<V extends ValueVector> implements JdbcConsumer<V> {
+
+    protected final int columnIndexInResultSet;
+    protected V vector;
+    protected int currentIndex;
+
+    /**
+     * Constructs a new consumer.
+     *
+     * @param vector the underlying vector for the consumer.
+     * @param index  the column id for the consumer.
+     */
+    public BaseConsumer(V vector, int index) {
+        this.vector = vector;
+        this.columnIndexInResultSet = index;
+    }
+
+    @Override
+    public void close() throws Exception {
+        this.vector.close();
+    }
+
+    @Override
+    public void resetValueVector(V vector) {
+        this.vector = vector;
+        this.currentIndex = 0;
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BigIntConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BigIntConsumer.java
new file mode 100644
index 0000000..4e7f5b1
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BigIntConsumer.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer; + +import org.apache.arrow.vector.BigIntVector; + +import java.sql.ResultSet; +import java.sql.SQLException; + +/** + * Consumer which consume bigint type values from {@link ResultSet}. + * Write the data to {@link BigIntVector}. + */ +public class BigIntConsumer { + + /** + * Creates a consumer for {@link BigIntVector}. + */ + public static JdbcConsumer createConsumer(BigIntVector vector, int index, boolean nullable) { + if (nullable) { + return new NullableBigIntConsumer(vector, index); + } else { + return new NonNullableBigIntConsumer(vector, index); + } + } + + /** + * Nullable consumer for big int. + */ + static class NullableBigIntConsumer extends BaseConsumer { + + /** + * Instantiate a BigIntConsumer. + */ + public NullableBigIntConsumer(BigIntVector vector, int index) { + super(vector, index); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException { + long value = resultSet.getLong(columnIndexInResultSet); + if (!resultSet.wasNull()) { + // for fixed width vectors, we have allocated enough memory proactively, + // so there is no need to call the setSafe method here. + vector.set(currentIndex, value); + } + currentIndex++; + } + } + + /** + * Non-nullable consumer for big int. + */ + static class NonNullableBigIntConsumer extends BaseConsumer { + + /** + * Instantiate a BigIntConsumer. + */ + public NonNullableBigIntConsumer(BigIntVector vector, int index) { + super(vector, index); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException { + long value = resultSet.getLong(columnIndexInResultSet); + // for fixed width vectors, we have allocated enough memory proactively, + // so there is no need to call the setSafe method here. + vector.set(currentIndex, value); + currentIndex++; + } + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BinaryConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BinaryConsumer.java new file mode 100644 index 0000000..0b05edd --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BinaryConsumer.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.VarBinaryVector; + +import java.io.IOException; +import java.io.InputStream; +import java.sql.ResultSet; +import java.sql.SQLException; + +/** + * Consumer which consume binary type values from {@link ResultSet}. + * Write the data to {@link VarBinaryVector}. + */ +public abstract class BinaryConsumer extends BaseConsumer { + + private final byte[] reuseBytes = new byte[1024]; + + /** + * Instantiate a BinaryConsumer. + */ + public BinaryConsumer(VarBinaryVector vector, int index) { + super(vector, index); + if (vector != null) { + vector.allocateNewSafe(); + } + } + + /** + * Creates a consumer for {@link VarBinaryVector}. + */ + public static BinaryConsumer createConsumer(VarBinaryVector vector, int index, boolean nullable) { + if (nullable) { + return new NullableBinaryConsumer(vector, index); + } else { + return new NonNullableBinaryConsumer(vector, index); + } + } + + /** + * consume a InputStream. + */ + public void consume(InputStream is) throws IOException { + if (is != null) { + while (currentIndex >= vector.getValueCapacity()) { + vector.reallocValidityAndOffsetBuffers(); + } + final int startOffset = vector.getStartOffset(currentIndex); + final ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + int dataLength = 0; + int read; + while ((read = is.read(reuseBytes)) != -1) { + while (vector.getDataBuffer().capacity() < (startOffset + dataLength + read)) { + vector.reallocDataBuffer(); + } + vector.getDataBuffer().setBytes(startOffset + dataLength, reuseBytes, 0, read); + dataLength += read; + } + offsetBuffer.setInt((currentIndex + 1) * VarBinaryVector.OFFSET_WIDTH, startOffset + dataLength); + BitVectorHelper.setBit(vector.getValidityBuffer(), currentIndex); + vector.setLastSet(currentIndex); + } else { + final int startOffset = vector.getStartOffset(currentIndex); + final ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + offsetBuffer.setInt((currentIndex + 1) * VarBinaryVector.OFFSET_WIDTH, startOffset + 0); + vector.setLastSet(currentIndex); + } + } + + public void moveWriterPosition() { + currentIndex++; + } + + @Override + public void resetValueVector(VarBinaryVector vector) { + this.vector = vector; + this.vector.allocateNewSafe(); + this.currentIndex = 0; + } + + /** + * Consumer for nullable binary data. + */ + static class NullableBinaryConsumer extends BinaryConsumer { + + /** + * Instantiate a BinaryConsumer. + */ + public NullableBinaryConsumer(VarBinaryVector vector, int index) { + super(vector, index); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException, IOException { + InputStream is = resultSet.getBinaryStream(columnIndexInResultSet); + consume(is); + moveWriterPosition(); + } + } + + /** + * Consumer for non-nullable binary data. + */ + static class NonNullableBinaryConsumer extends BinaryConsumer { + + /** + * Instantiate a BinaryConsumer. 
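+         * The inherited {@code consume(InputStream)} helper above streams the column in
+         * 1024-byte chunks into the vector's data buffer, reallocating as needed, so large
+         * values are copied without materializing one contiguous Java byte array.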
+ */ + public NonNullableBinaryConsumer(VarBinaryVector vector, int index) { + super(vector, index); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException, IOException { + InputStream is = resultSet.getBinaryStream(columnIndexInResultSet); + consume(is); + moveWriterPosition(); + } + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BitConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BitConsumer.java new file mode 100644 index 0000000..6f935e4 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BitConsumer.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer; + +import org.apache.arrow.vector.BitVector; + +import java.sql.ResultSet; +import java.sql.SQLException; + +/** + * Consumer which consume bit type values from {@link ResultSet}. + * Write the data to {@link BitVector}. + */ +public class BitConsumer { + + /** + * Creates a consumer for {@link BitVector}. + */ + public static JdbcConsumer createConsumer(BitVector vector, int index, boolean nullable) { + if (nullable) { + return new NullableBitConsumer(vector, index); + } else { + return new NonNullableBitConsumer(vector, index); + } + } + + /** + * Nullable consumer for {@link BitVector}. + */ + static class NullableBitConsumer extends BaseConsumer { + + /** + * Instantiate a BitConsumer. + */ + public NullableBitConsumer(BitVector vector, int index) { + super(vector, index); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException { + boolean value = resultSet.getBoolean(columnIndexInResultSet); + if (!resultSet.wasNull()) { + // for fixed width vectors, we have allocated enough memory proactively, + // so there is no need to call the setSafe method here. + vector.set(currentIndex, value ? 1 : 0); + } + currentIndex++; + } + } + + /** + * Non-nullable consumer for {@link BitVector}. + */ + static class NonNullableBitConsumer extends BaseConsumer { + + /** + * Instantiate a BitConsumer. + */ + public NonNullableBitConsumer(BitVector vector, int index) { + super(vector, index); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException { + boolean value = resultSet.getBoolean(columnIndexInResultSet); + // for fixed width vectors, we have allocated enough memory proactively, + // so there is no need to call the setSafe method here. + vector.set(currentIndex, value ? 
1 : 0); + currentIndex++; + } + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BlobConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BlobConsumer.java new file mode 100644 index 0000000..153c187 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/BlobConsumer.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer; + +import org.apache.arrow.vector.VarBinaryVector; + +import java.io.IOException; +import java.sql.Blob; +import java.sql.ResultSet; +import java.sql.SQLException; + +/** + * Consumer which consume blob type values from {@link ResultSet}. + * Write the data to {@link VarBinaryVector}. + */ +public class BlobConsumer extends BaseConsumer { + + private final boolean nullable; + private BinaryConsumer delegate; + + /** + * Instantiate a BlobConsumer. + */ + public BlobConsumer(BinaryConsumer delegate, int index, boolean nullable) { + super(null, index); + this.delegate = delegate; + this.nullable = nullable; + } + + /** + * Creates a consumer for {@link VarBinaryVector}. + */ + public static BlobConsumer createConsumer( + BinaryConsumer delegate, int index, boolean nullable) { + return new BlobConsumer(delegate, index, nullable); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException, IOException { + Blob blob = resultSet.getBlob(columnIndexInResultSet); + if (blob != null) { + delegate.consume(blob.getBinaryStream()); + } + delegate.moveWriterPosition(); + } + + @Override + public void close() throws Exception { + delegate.close(); + } + + @Override + public void resetValueVector(VarBinaryVector vector) { + delegate = BinaryConsumer.createConsumer(vector, columnIndexInResultSet, nullable); + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/ClobConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/ClobConsumer.java new file mode 100644 index 0000000..7f737b0 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/ClobConsumer.java @@ -0,0 +1,168 @@ +///* +// * Licensed to the Apache Software Foundation (ASF) under one or more +// * contributor license agreements. See the NOTICE file distributed with +// * this work for additional information regarding copyright ownership. 
+// * The ASF licenses this file to You under the Apache License, Version 2.0 +// * (the "License"); you may not use this file except in compliance with +// * the License. You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. +// */ +// +//package com.antgroup.antchain.fastdf.dataproxy.service.connector.rdbms.adaptor.consumer; +// +//import org.apache.arrow.memory.ArrowBuf; +//import org.apache.arrow.memory.util.MemoryUtil; +//import org.apache.arrow.vector.BitVectorHelper; +//import org.apache.arrow.vector.VarCharVector; +// +//import java.nio.charset.StandardCharsets; +//import java.sql.Clob; +//import java.sql.ResultSet; +//import java.sql.SQLException; +// +///** +// * Consumer which consume clob type values from {@link ResultSet}. +// * Write the data to {@link VarCharVector}. +// */ +//public abstract class ClobConsumer extends BaseConsumer { +// +// /** +// * Creates a consumer for {@link VarCharVector}. +// */ +// public static ClobConsumer createConsumer(VarCharVector vector, int index, boolean nullable) { +// if (nullable) { +// return new NullableClobConsumer(vector, index); +// } else { +// return new NonNullableClobConsumer(vector, index); +// } +// } +// +// private static final int BUFFER_SIZE = 256; +// +// /** +// * Instantiate a ClobConsumer. +// */ +// public ClobConsumer(VarCharVector vector, int index) { +// super(vector, index); +// if (vector != null) { +// vector.allocateNewSafe(); +// } +// } +// +// @Override +// public void resetValueVector(VarCharVector vector) { +// this.vector = vector; +// this.vector.allocateNewSafe(); +// this.currentIndex = 0; +// } +// +// /** +// * Nullable consumer for clob data. +// */ +// static class NullableClobConsumer extends ClobConsumer { +// +// /** +// * Instantiate a ClobConsumer. +// */ +// public NullableClobConsumer(VarCharVector vector, int index) { +// super(vector, index); +// } +// +// @Override +// public void consume(ResultSet resultSet) throws SQLException { +// Clob clob = resultSet.getClob(columnIndexInResultSet); +// if (!resultSet.wasNull()) { +// if (clob != null) { +// long length = clob.length(); +// +// int read = 1; +// int readSize = length < BUFFER_SIZE ? 
(int) length : BUFFER_SIZE; +// int totalBytes = 0; +// +// ArrowBuf dataBuffer = vector.getDataBuffer(); +// ArrowBuf offsetBuffer = vector.getOffsetBuffer(); +// int startIndex = offsetBuffer.getInt(currentIndex * 4); +// while (read <= length) { +// String str = clob.getSubString(read, readSize); +// byte[] bytes = str.getBytes(StandardCharsets.UTF_8); +// +// while ((dataBuffer.writerIndex() + bytes.length) > dataBuffer.capacity()) { +// vector.reallocDataBuffer(); +// } +// MemoryUtil.UNSAFE.copyMemory( +// bytes, +// MemoryUtil.BYTE_ARRAY_BASE_OFFSET, +// null, +// dataBuffer.memoryAddress() + startIndex + totalBytes, +// bytes.length); +// +// totalBytes += bytes.length; +// read += readSize; +// } +// offsetBuffer.setInt((currentIndex + 1) * 4, startIndex + totalBytes); +// BitVectorHelper.setBit(vector.getValidityBuffer(), currentIndex); +// vector.setLastSet(currentIndex); +// } +// } +// currentIndex++; +// } +// } +// +// /** +// * Non-nullable consumer for clob data. +// */ +// static class NonNullableClobConsumer extends ClobConsumer { +// +// /** +// * Instantiate a ClobConsumer. +// */ +// public NonNullableClobConsumer(VarCharVector vector, int index) { +// super(vector, index); +// } +// +// @Override +// public void consume(ResultSet resultSet) throws SQLException { +// Clob clob = resultSet.getClob(columnIndexInResultSet); +// if (clob != null) { +// long length = clob.length(); +// +// int read = 1; +// int readSize = length < BUFFER_SIZE ? (int) length : BUFFER_SIZE; +// int totalBytes = 0; +// +// ArrowBuf dataBuffer = vector.getDataBuffer(); +// ArrowBuf offsetBuffer = vector.getOffsetBuffer(); +// int startIndex = offsetBuffer.getInt(currentIndex * 4); +// while (read <= length) { +// String str = clob.getSubString(read, readSize); +// byte[] bytes = str.getBytes(StandardCharsets.UTF_8); +// +// while ((dataBuffer.writerIndex() + bytes.length) > dataBuffer.capacity()) { +// vector.reallocDataBuffer(); +// } +// MemoryUtil.UNSAFE.copyMemory( +// bytes, +// MemoryUtil.BYTE_ARRAY_BASE_OFFSET, +// null, +// dataBuffer.memoryAddress() + startIndex + totalBytes, +// bytes.length); +// +// totalBytes += bytes.length; +// read += readSize; +// } +// offsetBuffer.setInt((currentIndex + 1) * 4, startIndex + totalBytes); +// BitVectorHelper.setBit(vector.getValidityBuffer(), currentIndex); +// vector.setLastSet(currentIndex); +// } +// +// currentIndex++; +// } +// } +//} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/CompositeJdbcConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/CompositeJdbcConsumer.java new file mode 100644 index 0000000..a2d4dd7 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/CompositeJdbcConsumer.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer; + +import org.apache.arrow.util.AutoCloseables; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VectorSchemaRoot; + +import java.io.IOException; +import java.sql.ResultSet; +import java.sql.SQLException; + +/** + * Composite consumer which hold all consumers. + * It manages the consume and cleanup process. + */ +public class CompositeJdbcConsumer implements JdbcConsumer { + + private final JdbcConsumer[] consumers; + + /** + * Construct an instance. + */ + public CompositeJdbcConsumer(JdbcConsumer[] consumers) { + this.consumers = consumers; + } + + @Override + public void consume(ResultSet rs) throws SQLException, IOException { + for (int i = 0; i < consumers.length; i++) { + consumers[i].consume(rs); + } + } + + @Override + public void close() { + + try { + // clean up + AutoCloseables.close(consumers); + } catch (Exception e) { + throw new RuntimeException("Error occurred while releasing resources.", e); + } + + } + + @Override + public void resetValueVector(ValueVector vector) { + + } + + /** + * Reset inner consumers through vectors in the vector schema root. + */ + public void resetVectorSchemaRoot(VectorSchemaRoot root) { + assert root.getFieldVectors().size() == consumers.length; + for (int i = 0; i < consumers.length; i++) { + consumers[i].resetValueVector(root.getVector(i)); + } + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/DateConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/DateConsumer.java new file mode 100644 index 0000000..a6c207a --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/DateConsumer.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer; + +import org.apache.arrow.vector.DateDayVector; +import org.apache.arrow.vector.DateMilliVector; + +import java.sql.Date; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.Calendar; +import java.util.concurrent.TimeUnit; + +/** + * Consumer which consume date type values from {@link ResultSet}. + * Write the data to {@link DateDayVector}. 
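+ *
+ * <p>Illustrative note, not from the vendored source: the consumer stores whole
+ * days, computed as
+ * {@code Math.toIntExact(TimeUnit.MILLISECONDS.toDays(date.getTime()))}, so a
+ * DATE read as {@code 86_400_000} ms maps to day {@code 1}.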
+ */
+public class DateConsumer {
+
+    /**
+     * Creates a consumer for {@link DateDayVector}.
+     */
+    public static JdbcConsumer<DateDayVector> createConsumer(
+            DateDayVector vector, int index, boolean nullable, Calendar calendar) {
+        if (nullable) {
+            return new NullableDateConsumer(vector, index, calendar);
+        } else {
+            return new NonNullableDateConsumer(vector, index, calendar);
+        }
+    }
+
+    /**
+     * Nullable consumer for date.
+     */
+    static class NullableDateConsumer extends BaseConsumer<DateDayVector> {
+
+        protected final Calendar calendar;
+
+        /**
+         * Instantiate a DateConsumer.
+         */
+        public NullableDateConsumer(DateDayVector vector, int index) {
+            this(vector, index, /* calendar */null);
+        }
+
+        /**
+         * Instantiate a DateConsumer.
+         */
+        public NullableDateConsumer(DateDayVector vector, int index, Calendar calendar) {
+            super(vector, index);
+            this.calendar = calendar;
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            Date date = calendar == null ? resultSet.getDate(columnIndexInResultSet) :
+                    resultSet.getDate(columnIndexInResultSet, calendar);
+            if (!resultSet.wasNull()) {
+                // for fixed width vectors, we have allocated enough memory proactively,
+                // so there is no need to call the setSafe method here.
+                vector.set(currentIndex, Math.toIntExact(TimeUnit.MILLISECONDS.toDays(date.getTime())));
+            }
+            currentIndex++;
+        }
+    }
+
+    /**
+     * Non-nullable consumer for date.
+     */
+    static class NonNullableDateConsumer extends BaseConsumer<DateDayVector> {
+
+        protected final Calendar calendar;
+
+        /**
+         * Instantiate a DateConsumer.
+         */
+        public NonNullableDateConsumer(DateDayVector vector, int index) {
+            this(vector, index, /* calendar */null);
+        }
+
+        /**
+         * Instantiate a DateConsumer.
+         */
+        public NonNullableDateConsumer(DateDayVector vector, int index, Calendar calendar) {
+            super(vector, index);
+            this.calendar = calendar;
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            Date date = calendar == null ? resultSet.getDate(columnIndexInResultSet) :
+                    resultSet.getDate(columnIndexInResultSet, calendar);
+            // for fixed width vectors, we have allocated enough memory proactively,
+            // so there is no need to call the setSafe method here.
+            vector.set(currentIndex, Math.toIntExact(TimeUnit.MILLISECONDS.toDays(date.getTime())));
+            currentIndex++;
+        }
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/DecimalConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/DecimalConsumer.java
new file mode 100644
index 0000000..5fec433
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/DecimalConsumer.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer; + +import org.apache.arrow.vector.DecimalVector; + +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.sql.ResultSet; +import java.sql.SQLException; + +/** + * Consumer which consume decimal type values from {@link ResultSet}. + * Write the data to {@link DecimalVector}. + */ +public abstract class DecimalConsumer extends BaseConsumer { + private final RoundingMode bigDecimalRoundingMode; + private final int scale; + + /** + * Constructs a new consumer. + * + * @param vector the underlying vector for the consumer. + * @param index the column id for the consumer. + */ + public DecimalConsumer(DecimalVector vector, int index) { + this(vector, index, null); + } + + /** + * Constructs a new consumer, with optional coercibility. + * + * @param vector the underlying vector for the consumer. + * @param index the column index for the consumer. + * @param bigDecimalRoundingMode java.math.RoundingMode to be applied if the BigDecimal scale does not match that + * of the target vector. Set to null to retain strict matching behavior (scale of + * source and target vector must match exactly). + */ + public DecimalConsumer(DecimalVector vector, int index, RoundingMode bigDecimalRoundingMode) { + super(vector, index); + this.bigDecimalRoundingMode = bigDecimalRoundingMode; + this.scale = vector.getScale(); + } + + /** + * Creates a consumer for {@link DecimalVector}. + */ + public static JdbcConsumer createConsumer( + DecimalVector vector, + int index, + boolean nullable, + RoundingMode bigDecimalRoundingMode + ) { + if (nullable) { + return new NullableDecimalConsumer(vector, index, bigDecimalRoundingMode); + } else { + return new NonNullableDecimalConsumer(vector, index, bigDecimalRoundingMode); + } + } + + protected void set(BigDecimal value) { + if (bigDecimalRoundingMode != null && value.scale() != scale) { + value = value.setScale(scale, bigDecimalRoundingMode); + } + vector.set(currentIndex, value); + } + + + /** + * Consumer for nullable decimal. + */ + static class NullableDecimalConsumer extends DecimalConsumer { + + /** + * Instantiate a DecimalConsumer. + */ + public NullableDecimalConsumer(DecimalVector vector, int index, RoundingMode bigDecimalRoundingMode) { + super(vector, index, bigDecimalRoundingMode); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException { + BigDecimal value = resultSet.getBigDecimal(columnIndexInResultSet); + if (!resultSet.wasNull()) { + // for fixed width vectors, we have allocated enough memory proactively, + // so there is no need to call the setSafe method here. + set(value); + } + currentIndex++; + } + } + + /** + * Consumer for non-nullable decimal. + */ + static class NonNullableDecimalConsumer extends DecimalConsumer { + + /** + * Instantiate a DecimalConsumer. + */ + public NonNullableDecimalConsumer(DecimalVector vector, int index, RoundingMode bigDecimalRoundingMode) { + super(vector, index, bigDecimalRoundingMode); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException { + BigDecimal value = resultSet.getBigDecimal(columnIndexInResultSet); + // for fixed width vectors, we have allocated enough memory proactively, + // so there is no need to call the setSafe method here. 
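+            // Note: resultSet.getBigDecimal can return null for SQL NULL; this non-nullable
+            // variant assumes the column is declared NOT NULL, otherwise set(value) below
+            // would throw a NullPointerException when the scale is inspected.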
+ set(value); + currentIndex++; + } + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/DoubleConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/DoubleConsumer.java new file mode 100644 index 0000000..1c2ee7e --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/DoubleConsumer.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer; + +import org.apache.arrow.vector.Float8Vector; + +import java.sql.ResultSet; +import java.sql.SQLException; + +/** + * Consumer which consume double type values from {@link ResultSet}. + * Write the data to {@link Float8Vector}. + */ +public class DoubleConsumer { + + /** + * Creates a consumer for {@link Float8Vector}. + */ + public static JdbcConsumer createConsumer(Float8Vector vector, int index, boolean nullable) { + if (nullable) { + return new NullableDoubleConsumer(vector, index); + } else { + return new NonNullableDoubleConsumer(vector, index); + } + } + + /** + * Nullable double consumer. + */ + static class NullableDoubleConsumer extends BaseConsumer { + + /** + * Instantiate a DoubleConsumer. + */ + public NullableDoubleConsumer(Float8Vector vector, int index) { + super(vector, index); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException { + double value = resultSet.getDouble(columnIndexInResultSet); + if (!resultSet.wasNull()) { + // for fixed width vectors, we have allocated enough memory proactively, + // so there is no need to call the setSafe method here. + vector.set(currentIndex, value); + } + currentIndex++; + } + } + + /** + * Non-nullable double consumer. + */ + static class NonNullableDoubleConsumer extends BaseConsumer { + + /** + * Instantiate a DoubleConsumer. + */ + public NonNullableDoubleConsumer(Float8Vector vector, int index) { + super(vector, index); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException { + double value = resultSet.getDouble(columnIndexInResultSet); + // for fixed width vectors, we have allocated enough memory proactively, + // so there is no need to call the setSafe method here. 
+ vector.set(currentIndex, value); + currentIndex++; + } + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/FloatConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/FloatConsumer.java new file mode 100644 index 0000000..641b7ea --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/FloatConsumer.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer; + +import org.apache.arrow.vector.Float4Vector; + +import java.sql.ResultSet; +import java.sql.SQLException; + +/** + * Consumer which consume float type values from {@link ResultSet}. + * Write the data to {@link Float4Vector}. + */ +public class FloatConsumer { + + /** + * Creates a consumer for {@link Float4Vector}. + */ + public static JdbcConsumer createConsumer(Float4Vector vector, int index, boolean nullable) { + if (nullable) { + return new NullableFloatConsumer(vector, index); + } else { + return new NonNullableFloatConsumer(vector, index); + } + } + + /** + * Nullable float consumer. + */ + static class NullableFloatConsumer extends BaseConsumer { + + /** + * Instantiate a FloatConsumer. + */ + public NullableFloatConsumer(Float4Vector vector, int index) { + super(vector, index); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException { + float value = resultSet.getFloat(columnIndexInResultSet); + if (!resultSet.wasNull()) { + // for fixed width vectors, we have allocated enough memory proactively, + // so there is no need to call the setSafe method here. + vector.set(currentIndex, value); + } + currentIndex++; + } + } + + /** + * Non-nullable float consumer. + */ + static class NonNullableFloatConsumer extends BaseConsumer { + + /** + * Instantiate a FloatConsumer. + */ + public NonNullableFloatConsumer(Float4Vector vector, int index) { + super(vector, index); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException { + float value = resultSet.getFloat(columnIndexInResultSet); + // for fixed width vectors, we have allocated enough memory proactively, + // so there is no need to call the setSafe method here. 
+ vector.set(currentIndex, value); + currentIndex++; + } + } +} diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/IntConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/IntConsumer.java new file mode 100644 index 0000000..954dc66 --- /dev/null +++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/IntConsumer.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer; + +import org.apache.arrow.vector.IntVector; + +import java.sql.ResultSet; +import java.sql.SQLException; + +/** + * Consumer which consume int type values from {@link ResultSet}. + * Write the data to {@link IntVector}. + */ +public class IntConsumer { + + /** + * Creates a consumer for {@link IntVector}. + */ + public static JdbcConsumer createConsumer(IntVector vector, int index, boolean nullable) { + if (nullable) { + return new NullableIntConsumer(vector, index); + } else { + return new NonNullableIntConsumer(vector, index); + } + } + + /** + * Nullable consumer for int. + */ + static class NullableIntConsumer extends BaseConsumer { + + /** + * Instantiate a IntConsumer. + */ + public NullableIntConsumer(IntVector vector, int index) { + super(vector, index); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException { + int value = resultSet.getInt(columnIndexInResultSet); + if (!resultSet.wasNull()) { + // for fixed width vectors, we have allocated enough memory proactively, + // so there is no need to call the setSafe method here. + vector.set(currentIndex, value); + } + currentIndex++; + } + } + + /** + * Non-nullable consumer for int. + */ + static class NonNullableIntConsumer extends BaseConsumer { + + /** + * Instantiate a IntConsumer. + */ + public NonNullableIntConsumer(IntVector vector, int index) { + super(vector, index); + } + + @Override + public void consume(ResultSet resultSet) throws SQLException { + int value = resultSet.getInt(columnIndexInResultSet); + // for fixed width vectors, we have allocated enough memory proactively, + // so there is no need to call the setSafe method here. 
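+            // Note: resultSet.getInt reads SQL NULL as 0, so this non-nullable variant would
+            // silently store 0; it is chosen only for columns that cannot contain NULLs.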
+            vector.set(currentIndex, value);
+            currentIndex++;
+        }
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/JdbcConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/JdbcConsumer.java
new file mode 100644
index 0000000..78ec387
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/JdbcConsumer.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer;
+
+import org.apache.arrow.vector.ValueVector;
+
+import java.io.IOException;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+/**
+ * An abstraction that is used to consume values from {@link ResultSet}.
+ *
+ * @param <T> The vector within the consumer or its delegate, used for partial consumption.
+ */
+public interface JdbcConsumer<T extends ValueVector> extends AutoCloseable {
+
+    /**
+     * Consume a specific type value from {@link ResultSet} and write it to the vector.
+     */
+    void consume(ResultSet resultSet) throws SQLException, IOException;
+
+    /**
+     * Close this consumer and do cleanup work, such as clearing the reused ArrowBuf.
+     */
+    void close() throws Exception;
+
+    /**
+     * Reset the vector within the consumer for partial reads.
+     */
+    void resetValueVector(T vector);
+}
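A JdbcConsumer is created per column and advanced once per result-set row; each consumer tracks its own write index into its vector. A sketch of the driving loop, under the assumption that the vectors were pre-allocated and one consumer exists per column (the ConsumerDriver class and its wiring are illustrative, not part of this patch):

    import java.sql.ResultSet;

    public class ConsumerDriver {

        // Advance every column consumer once per result-set row, then release
        // any reusable buffers the consumers hold.
        public static void consumeAll(ResultSet resultSet, JdbcConsumer<?>[] consumers) throws Exception {
            while (resultSet.next()) {
                for (JdbcConsumer<?> consumer : consumers) {
                    consumer.consume(resultSet); // each consumer tracks currentIndex itself
                }
            }
            for (JdbcConsumer<?> consumer : consumers) {
                consumer.close();
            }
        }
    }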
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/MapConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/MapConsumer.java
new file mode 100644
index 0000000..26df514
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/MapConsumer.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer;
+
+import com.fasterxml.jackson.core.type.TypeReference;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.arrow.memory.ArrowBuf;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.complex.MapVector;
+import org.apache.arrow.vector.complex.impl.UnionMapWriter;
+import org.apache.arrow.vector.util.ObjectMapperFactory;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.Map;
+
+/**
+ * Consumer which consumes map type values from {@link ResultSet}.
+ * Writes the data into {@link MapVector}.
+ */
+public class MapConsumer extends BaseConsumer<MapVector> {
+
+
+    private final UnionMapWriter writer;
+    private final ObjectMapper objectMapper = ObjectMapperFactory.newObjectMapper();
+    private final TypeReference<Map<String, String>> typeReference = new TypeReference<Map<String, String>>() {
+    };
+    private int currentRow;
+
+    /**
+     * Instantiate a MapConsumer.
+     */
+    public MapConsumer(MapVector vector, int index) {
+        super(vector, index);
+        writer = vector.getWriter();
+    }
+
+    /**
+     * Creates a consumer for {@link MapVector}.
+     */
+    public static MapConsumer createConsumer(MapVector mapVector, int index, boolean nullable) {
+        return new MapConsumer(mapVector, index);
+    }
+
+    @Override
+    public void consume(ResultSet resultSet) throws SQLException, IOException {
+        Object map = resultSet.getObject(columnIndexInResultSet);
+        writer.setPosition(currentRow++);
+        if (map != null) {
+            if (map instanceof String) {
+                writeJavaMapIntoVector(objectMapper.readValue((String) map, typeReference));
+            } else if (map instanceof Map) {
+                writeJavaMapIntoVector((Map<String, String>) map);
+            } else {
+                throw new IllegalArgumentException("Unknown type of map type column from JDBC " + map.getClass().getName());
+            }
+        } else {
+            writer.writeNull();
+        }
+    }
+
+    private void writeJavaMapIntoVector(Map<String, String> map) {
+        BufferAllocator allocator = vector.getAllocator();
+        writer.startMap();
+        map.forEach((key, value) -> {
+            byte[] keyBytes = key.getBytes(StandardCharsets.UTF_8);
+            byte[] valueBytes = value != null ? value.getBytes(StandardCharsets.UTF_8) : null;
+            try (
+                    ArrowBuf keyBuf = allocator.buffer(keyBytes.length);
+                    ArrowBuf valueBuf = valueBytes != null ? allocator.buffer(valueBytes.length) : null;
+            ) {
+                writer.startEntry();
+                keyBuf.writeBytes(keyBytes);
+                writer.key().varChar().writeVarChar(0, keyBytes.length, keyBuf);
+                if (valueBytes != null) {
+                    valueBuf.writeBytes(valueBytes);
+                    writer.value().varChar().writeVarChar(0, valueBytes.length, valueBuf);
+                } else {
+                    writer.value().varChar().writeNull();
+                }
+                writer.endEntry();
+            }
+        });
+        writer.endMap();
+    }
+}
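MapConsumer accepts two JDBC shapes for a map column: a JSON-encoded string, or a java.util.Map handed back directly by the driver. A sketch of the first shape and how it decodes (values are made up; the real code goes through Arrow's ObjectMapperFactory rather than a bare ObjectMapper):

    import com.fasterxml.jackson.core.type.TypeReference;
    import com.fasterxml.jackson.databind.ObjectMapper;

    import java.util.Map;

    public class MapColumnSketch {
        public static void main(String[] args) throws Exception {
            // A JSON cell value as it might come back from resultSet.getObject(...)
            String jsonCell = "{\"k1\":\"v1\",\"k2\":null}";
            Map<String, String> decoded = new ObjectMapper()
                    .readValue(jsonCell, new TypeReference<Map<String, String>>() {});
            System.out.println(decoded); // {k1=v1, k2=null}
        }
    }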
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/NullConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/NullConsumer.java
new file mode 100644
index 0000000..4fd3ed7
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/NullConsumer.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer;
+
+import org.apache.arrow.vector.NullVector;
+
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+/**
+ * Consumer which consumes null type values from ResultSet.
+ * Corresponding to {@link NullVector}.
+ */
+public class NullConsumer extends BaseConsumer<NullVector> {
+
+    public NullConsumer(NullVector vector) {
+        super(vector, 0);
+    }
+
+    @Override
+    public void consume(ResultSet resultSet) throws SQLException {
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/SmallIntConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/SmallIntConsumer.java
new file mode 100644
index 0000000..ee4baef
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/SmallIntConsumer.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer;
+
+import org.apache.arrow.vector.SmallIntVector;
+
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+/**
+ * Consumer which consumes smallInt type values from {@link ResultSet}.
+ * Writes the data to {@link SmallIntVector}.
+ */
+public class SmallIntConsumer {
+
+    /**
+     * Creates a consumer for {@link SmallIntVector}.
+     */
+    public static BaseConsumer<SmallIntVector> createConsumer(SmallIntVector vector, int index, boolean nullable) {
+        if (nullable) {
+            return new NullableSmallIntConsumer(vector, index);
+        } else {
+            return new NonNullableSmallIntConsumer(vector, index);
+        }
+    }
+
+    /**
+     * Nullable consumer for small int.
+     */
+    static class NullableSmallIntConsumer extends BaseConsumer<SmallIntVector> {
+
+        /**
+         * Instantiate a SmallIntConsumer.
+         */
+        public NullableSmallIntConsumer(SmallIntVector vector, int index) {
+            super(vector, index);
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            short value = resultSet.getShort(columnIndexInResultSet);
+            if (!resultSet.wasNull()) {
+                // for fixed width vectors, we have allocated enough memory proactively,
+                // so there is no need to call the setSafe method here.
+                vector.set(currentIndex, value);
+            }
+            currentIndex++;
+        }
+    }
+
+    /**
+     * Non-nullable consumer for small int.
+     */
+    static class NonNullableSmallIntConsumer extends BaseConsumer<SmallIntVector> {
+
+        /**
+         * Instantiate a SmallIntConsumer.
+         */
+        public NonNullableSmallIntConsumer(SmallIntVector vector, int index) {
+            super(vector, index);
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            short value = resultSet.getShort(columnIndexInResultSet);
+            // for fixed width vectors, we have allocated enough memory proactively,
+            // so there is no need to call the setSafe method here.
+            vector.set(currentIndex, value);
+            currentIndex++;
+        }
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/TimeConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/TimeConsumer.java
new file mode 100644
index 0000000..bfe5ede
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/TimeConsumer.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer;
+
+import org.apache.arrow.vector.TimeMilliVector;
+
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Time;
+import java.util.Calendar;
+
+/**
+ * Consumer which consumes time type values from {@link ResultSet}.
+ * Writes the data to {@link TimeMilliVector}.
+ */
+public abstract class TimeConsumer {
+
+    /**
+     * Creates a consumer for {@link TimeMilliVector}.
+     */
+    public static JdbcConsumer<TimeMilliVector> createConsumer(
+            TimeMilliVector vector, int index, boolean nullable, Calendar calendar) {
+        if (nullable) {
+            return new NullableTimeConsumer(vector, index, calendar);
+        } else {
+            return new NonNullableTimeConsumer(vector, index, calendar);
+        }
+    }
+
+    /**
+     * Nullable consumer for {@link TimeMilliVector}.
+     */
+    static class NullableTimeConsumer extends BaseConsumer<TimeMilliVector> {
+
+        protected final Calendar calendar;
+
+        /**
+         * Instantiate a TimeConsumer.
+         */
+        public NullableTimeConsumer(TimeMilliVector vector, int index) {
+            this(vector, index, /* calendar */null);
+        }
+
+        /**
+         * Instantiate a TimeConsumer.
+         */
+        public NullableTimeConsumer(TimeMilliVector vector, int index, Calendar calendar) {
+            super(vector, index);
+            this.calendar = calendar;
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            Time time = calendar == null ? resultSet.getTime(columnIndexInResultSet) :
+                    resultSet.getTime(columnIndexInResultSet, calendar);
+            if (!resultSet.wasNull()) {
+                // for fixed width vectors, we have allocated enough memory proactively,
+                // so there is no need to call the setSafe method here.
+                vector.set(currentIndex, (int) time.getTime());
+            }
+            currentIndex++;
+        }
+    }
+
+    /**
+     * Non-nullable consumer for {@link TimeMilliVector}.
+     */
+    static class NonNullableTimeConsumer extends BaseConsumer<TimeMilliVector> {
+
+        protected final Calendar calendar;
+
+        /**
+         * Instantiate a TimeConsumer.
+         */
+        public NonNullableTimeConsumer(TimeMilliVector vector, int index) {
+            this(vector, index, /* calendar */null);
+        }
+
+        /**
+         * Instantiate a TimeConsumer.
+         */
+        public NonNullableTimeConsumer(TimeMilliVector vector, int index, Calendar calendar) {
+            super(vector, index);
+            this.calendar = calendar;
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            Time time = calendar == null ? resultSet.getTime(columnIndexInResultSet) :
+                    resultSet.getTime(columnIndexInResultSet, calendar);
+            // for fixed width vectors, we have allocated enough memory proactively,
+            // so there is no need to call the setSafe method here.
+            vector.set(currentIndex, (int) time.getTime());
+            currentIndex++;
+        }
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/TimestampConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/TimestampConsumer.java
new file mode 100644
index 0000000..228b9f2
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/TimestampConsumer.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer;
+
+import org.apache.arrow.vector.TimeStampMilliVector;
+
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Timestamp;
+
+/**
+ * Consumer which consumes timestamp type values from {@link ResultSet}.
+ * Writes the data to {@link TimeStampMilliVector}.
+ */
+public abstract class TimestampConsumer {
+
+    /**
+     * Creates a consumer for {@link TimeStampMilliVector}.
+     */
+    public static JdbcConsumer<TimeStampMilliVector> createConsumer(
+            TimeStampMilliVector vector, int index, boolean nullable) {
+        if (nullable) {
+            return new NullableTimestampConsumer(vector, index);
+        } else {
+            return new NonNullableTimestampConsumer(vector, index);
+        }
+    }
+
+    /**
+     * Nullable consumer for timestamp.
+     */
+    static class NullableTimestampConsumer extends BaseConsumer<TimeStampMilliVector> {
+
+        /**
+         * Instantiate a TimestampConsumer.
+         */
+        public NullableTimestampConsumer(TimeStampMilliVector vector, int index) {
+            super(vector, index);
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            Timestamp timestamp = resultSet.getTimestamp(columnIndexInResultSet);
+            if (!resultSet.wasNull()) {
+                // for fixed width vectors, we have allocated enough memory proactively,
+                // so there is no need to call the setSafe method here.
+                vector.set(currentIndex, timestamp.getTime());
+            }
+            currentIndex++;
+        }
+    }
+
+    /**
+     * Non-nullable consumer for timestamp.
+     */
+    static class NonNullableTimestampConsumer extends BaseConsumer<TimeStampMilliVector> {
+
+        /**
+         * Instantiate a TimestampConsumer.
+         */
+        public NonNullableTimestampConsumer(TimeStampMilliVector vector, int index) {
+            super(vector, index);
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            Timestamp timestamp = resultSet.getTimestamp(columnIndexInResultSet);
+            // for fixed width vectors, we have allocated enough memory proactively,
+            // so there is no need to call the setSafe method here.
+            vector.set(currentIndex, timestamp.getTime());
+            currentIndex++;
+        }
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/TimestampTZConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/TimestampTZConsumer.java
new file mode 100644
index 0000000..3d405a5
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/TimestampTZConsumer.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer;
+
+import org.apache.arrow.util.Preconditions;
+import org.apache.arrow.vector.TimeStampMilliTZVector;
+
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Timestamp;
+import java.util.Calendar;
+
+/**
+ * Consumer which consumes timestamp (with time zone) type values from {@link ResultSet}.
+ * Writes the data to {@link TimeStampMilliTZVector}.
+ */
+public class TimestampTZConsumer {
+    /**
+     * Creates a consumer for {@link TimeStampMilliTZVector}.
+     */
+    public static JdbcConsumer<TimeStampMilliTZVector> createConsumer(
+            TimeStampMilliTZVector vector, int index, boolean nullable, Calendar calendar) {
+        Preconditions.checkArgument(calendar != null, "Calendar cannot be null");
+        if (nullable) {
+            return new NullableTimestampTZConsumer(vector, index, calendar);
+        } else {
+            return new NonNullableTimestampConsumer(vector, index, calendar);
+        }
+    }
+
+    /**
+     * Nullable consumer for timestamp (with time zone).
+     */
+    static class NullableTimestampTZConsumer extends BaseConsumer<TimeStampMilliTZVector> {
+
+        protected final Calendar calendar;
+
+        /**
+         * Instantiate a TimestampConsumer.
+         */
+        public NullableTimestampTZConsumer(TimeStampMilliTZVector vector, int index, Calendar calendar) {
+            super(vector, index);
+            this.calendar = calendar;
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            Timestamp timestamp = resultSet.getTimestamp(columnIndexInResultSet, calendar);
+            if (!resultSet.wasNull()) {
+                // for fixed width vectors, we have allocated enough memory proactively,
+                // so there is no need to call the setSafe method here.
+                vector.set(currentIndex, timestamp.getTime());
+            }
+            currentIndex++;
+        }
+    }
+
+    /**
+     * Non-nullable consumer for timestamp (with time zone).
+     */
+    static class NonNullableTimestampConsumer extends BaseConsumer<TimeStampMilliTZVector> {
+
+        protected final Calendar calendar;
+
+        /**
+         * Instantiate a TimestampConsumer.
+         */
+        public NonNullableTimestampConsumer(TimeStampMilliTZVector vector, int index, Calendar calendar) {
+            super(vector, index);
+            this.calendar = calendar;
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            Timestamp timestamp = resultSet.getTimestamp(columnIndexInResultSet, calendar);
+            // for fixed width vectors, we have allocated enough memory proactively,
+            // so there is no need to call the setSafe method here.
+            vector.set(currentIndex, timestamp.getTime());
+            currentIndex++;
+        }
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/TinyIntConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/TinyIntConsumer.java
new file mode 100644
index 0000000..3a67697
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/TinyIntConsumer.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer;
+
+import org.apache.arrow.vector.TinyIntVector;
+
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+/**
+ * Consumer which consumes tinyInt type values from {@link ResultSet}.
+ * Writes the data to {@link TinyIntVector}.
+ */
+public abstract class TinyIntConsumer {
+
+    /**
+     * Creates a consumer for {@link TinyIntVector}.
+     */
+    public static JdbcConsumer<TinyIntVector> createConsumer(TinyIntVector vector, int index, boolean nullable) {
+        if (nullable) {
+            return new NullableTinyIntConsumer(vector, index);
+        } else {
+            return new NonNullableTinyIntConsumer(vector, index);
+        }
+    }
+
+    /**
+     * Nullable consumer for tiny int.
+     */
+    static class NullableTinyIntConsumer extends BaseConsumer<TinyIntVector> {
+
+        /**
+         * Instantiate a TinyIntConsumer.
+         */
+        public NullableTinyIntConsumer(TinyIntVector vector, int index) {
+            super(vector, index);
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            byte value = resultSet.getByte(columnIndexInResultSet);
+            if (!resultSet.wasNull()) {
+                // for fixed width vectors, we have allocated enough memory proactively,
+                // so there is no need to call the setSafe method here.
+                vector.set(currentIndex, value);
+            }
+            currentIndex++;
+        }
+    }
+
+    /**
+     * Non-nullable consumer for tiny int.
+     */
+    static class NonNullableTinyIntConsumer extends BaseConsumer<TinyIntVector> {
+
+        /**
+         * Instantiate a TinyIntConsumer.
+         */
+        public NonNullableTinyIntConsumer(TinyIntVector vector, int index) {
+            super(vector, index);
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            byte value = resultSet.getByte(columnIndexInResultSet);
+            // for fixed width vectors, we have allocated enough memory proactively,
+            // so there is no need to call the setSafe method here.
+            vector.set(currentIndex, value);
+            currentIndex++;
+        }
+    }
+}
diff --git a/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/VarCharConsumer.java b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/VarCharConsumer.java
new file mode 100644
index 0000000..0801d36
--- /dev/null
+++ b/dataproxy-manager/src/main/java/org/secretflow/dataproxy/manager/connector/rdbms/adaptor/consumer/VarCharConsumer.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.manager.connector.rdbms.adaptor.consumer;
+
+import org.apache.arrow.vector.VarCharVector;
+
+import java.nio.charset.StandardCharsets;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+/**
+ * Consumer which consumes varchar type values from {@link ResultSet}.
+ * Writes the data to {@link VarCharVector}.
+ */
+public abstract class VarCharConsumer {
+
+    /**
+     * Creates a consumer for {@link VarCharVector}.
+     */
+    public static JdbcConsumer<VarCharVector> createConsumer(VarCharVector vector, int index, boolean nullable) {
+        if (nullable) {
+            return new NullableVarCharConsumer(vector, index);
+        } else {
+            return new NonNullableVarCharConsumer(vector, index);
+        }
+    }
+
+    /**
+     * Nullable consumer for var char.
+     */
+    static class NullableVarCharConsumer extends BaseConsumer<VarCharVector> {
+
+        /**
+         * Instantiate a VarCharConsumer.
+         */
+        public NullableVarCharConsumer(VarCharVector vector, int index) {
+            super(vector, index);
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            String value = resultSet.getString(columnIndexInResultSet);
+            if (!resultSet.wasNull()) {
+                byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
+                vector.setSafe(currentIndex, bytes);
+            }
+            currentIndex++;
+        }
+    }
+
+    /**
+     * Non-nullable consumer for var char.
+     */
+    static class NonNullableVarCharConsumer extends BaseConsumer<VarCharVector> {
+
+        /**
+         * Instantiate a VarCharConsumer.
+         */
+        public NonNullableVarCharConsumer(VarCharVector vector, int index) {
+            super(vector, index);
+        }
+
+        @Override
+        public void consume(ResultSet resultSet) throws SQLException {
+            String value = resultSet.getString(columnIndexInResultSet);
+            byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
+            vector.setSafe(currentIndex, bytes);
+            currentIndex++;
+        }
+    }
+}
diff --git a/dataproxy-server/pom.xml b/dataproxy-server/pom.xml
new file mode 100644
index 0000000..696e4a4
--- /dev/null
+++ b/dataproxy-server/pom.xml
@@ -0,0 +1,77 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>org.secretflow</groupId>
+        <artifactId>dataproxy</artifactId>
+        <version>0.0.1-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>dataproxy-server</artifactId>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.secretflow</groupId>
+            <artifactId>dataproxy-manager</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.secretflow</groupId>
+            <artifactId>dataproxy-service</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>javax.validation</groupId>
+            <artifactId>validation-api</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.hibernate.validator</groupId>
+            <artifactId>hibernate-validator</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>org.aspectj</groupId>
+            <artifactId>aspectjweaver</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.aspectj</groupId>
+            <artifactId>aspectjrt</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.springframework.boot</groupId>
+                <artifactId>spring-boot-maven-plugin</artifactId>
+                <configuration>
+                    <outputDirectory>../target</outputDirectory>
+                    <finalName>dataproxy</finalName>
+                </configuration>
+            </plugin>
+        </plugins>
+        <resources>
+            <resource>
+                <directory>../config</directory>
+                <targetPath>${project.basedir}/config</targetPath>
+            </resource>
+            <resource>
+                <directory>../scripts/test</directory>
+                <targetPath>${project.basedir}/config</targetPath>
+            </resource>
+        </resources>
+    </build>
+</project>
\ No newline at end of file
diff --git a/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/DataProxyApplication.java b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/DataProxyApplication.java
new file mode 100644
index 0000000..ec94dc7
--- /dev/null
+++ b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/DataProxyApplication.java
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.secretflow.dataproxy.server; + +import org.springframework.boot.WebApplicationType; +import org.springframework.boot.autoconfigure.SpringBootApplication; +import org.springframework.boot.builder.SpringApplicationBuilder; +import org.springframework.context.annotation.EnableAspectJAutoProxy; +import org.springframework.scheduling.annotation.EnableAsync; +import org.springframework.scheduling.annotation.EnableScheduling; + +/** + * @author muhong + * @date 2023-08-08 7:43 PM + */ +@EnableAsync +@EnableScheduling +@EnableAspectJAutoProxy +@SpringBootApplication(scanBasePackages = "org.secretflow.dataproxy") +public class DataProxyApplication { + + public static void main(String[] args) { + new SpringApplicationBuilder(DataProxyApplication.class) + .web(WebApplicationType.NONE) + .run(args); + } +} diff --git a/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/DataproxyLauncher.java b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/DataproxyLauncher.java new file mode 100644 index 0000000..1bb78be --- /dev/null +++ b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/DataproxyLauncher.java @@ -0,0 +1,87 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.secretflow.dataproxy.server;
+
+import lombok.extern.slf4j.Slf4j;
+import org.apache.arrow.flight.FlightProducer;
+import org.apache.arrow.flight.FlightServer;
+import org.apache.arrow.flight.Location;
+import org.apache.arrow.memory.BufferAllocator;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.CommandLineRunner;
+import org.springframework.stereotype.Service;
+
+/**
+ * Arrow Flight server launcher.
+ *
+ * @author muhong
+ * @date 2023-08-07 10:26 AM
+ */
+@Slf4j
+@Service
+public class DataproxyLauncher implements CommandLineRunner {
+
+    private FlightServer flightServer;
+
+    @Autowired
+    private BufferAllocator bufferAllocator;
+
+    @Autowired
+    private FlightProducer flightProducer;
+
+    @Autowired
+    private Location location;
+
+    /**
+     * Start the gRPC (Arrow Flight) server.
+     */
+    private void grpcStart() {
+        FlightServer.Builder flightServerBuilder = FlightServer.builder()
+                .allocator(bufferAllocator)
+                .middleware(FlightServerTraceMiddleware.getKey(), new FlightServerTraceMiddleware.FlightServerTraceMiddlewareFactory())
+                .location(location);
+
+        try (FlightServer server = flightServerBuilder.producer(flightProducer).build()) {
+            flightServer = server;
+            flightServer.start();
+            // register the shutdown hook before blocking on awaitTermination(),
+            // otherwise it would only be registered after the server had already stopped
+            Runtime.getRuntime().addShutdownHook(new Thread(this::grpcStop));
+            log.info("Flight server launch success, listening on port {}, ip:{}", flightServer.getPort(), location.getUri().getHost());
+            flightServer.awaitTermination();
+        } catch (Exception e) {
+            log.error("Flight server launch failed", e);
+            throw new RuntimeException("Failed to start Flight Server", e);
+        }
+    }
+
+    /**
+     * Stop the gRPC (Arrow Flight) server.
+     */
+    private void grpcStop() {
+        if (flightServer != null) {
+            try {
+                flightServer.close();
+            } catch (InterruptedException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    @Override
+    public void run(String... args) throws Exception {
+        grpcStart();
+    }
+}
diff --git a/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/FlightServerTraceMiddleware.java b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/FlightServerTraceMiddleware.java
new file mode 100644
index 0000000..f83a8f0
--- /dev/null
+++ b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/FlightServerTraceMiddleware.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.server;
+
+import org.secretflow.dataproxy.common.utils.IdUtils;
+
+import org.apache.arrow.flight.*;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.MDC;
+
+/**
+ * Trace ID middleware.
+ *
+ * @author muhong
+ * @date 2023-09-25 14:39
+ */
+public class FlightServerTraceMiddleware implements FlightServerMiddleware {
+
+    public static final String TRACE_ID_KEY = "trace-id";
+    private static final String GENERATE_TRACE_ID_PREFIX = "DP-FLIGHT";
+
+    public static Key<FlightServerTraceMiddleware> getKey() {
+        return FlightServerMiddleware.Key.of(FlightServerTraceMiddleware.class.getCanonicalName());
+    }
+
+    @Override
+    public void onBeforeSendingHeaders(CallHeaders outgoingHeaders) {
+
+    }
+
+    @Override
+    public void onCallCompleted(CallStatus status) {
+        MDC.remove("TraceId");
+    }
+
+    @Override
+    public void onCallErrored(Throwable err) {
+        MDC.remove("TraceId");
+    }
+
+    public static class FlightServerTraceMiddlewareFactory implements Factory<FlightServerTraceMiddleware> {
+        @Override
+        public FlightServerTraceMiddleware onCallStarted(CallInfo info, CallHeaders incomingHeaders, RequestContext context) {
+            // set the trace ID for this call chain
+            String traceId = incomingHeaders.get(TRACE_ID_KEY);
+            // generate a trace ID if the caller did not pass one in
+            if (StringUtils.isEmpty(traceId)) {
+                traceId = GENERATE_TRACE_ID_PREFIX + "-" + IdUtils.createRandString(32);
+            }
+            MDC.put("TraceId", traceId);
+            return new FlightServerTraceMiddleware();
+        }
+    }
+}
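Callers propagate their own trace ID by sending it in the "trace-id" call header; otherwise the factory above generates a DP-FLIGHT-prefixed one. A client-side sketch of that propagation (the host, port, and trace ID value are made up; FlightCallHeaders and HeaderCallOption are Arrow Flight's standard header-passing types):

    import org.apache.arrow.flight.Criteria;
    import org.apache.arrow.flight.FlightCallHeaders;
    import org.apache.arrow.flight.FlightClient;
    import org.apache.arrow.flight.HeaderCallOption;
    import org.apache.arrow.flight.Location;
    import org.apache.arrow.memory.RootAllocator;

    public class TraceIdClientSketch {
        public static void main(String[] args) {
            try (RootAllocator allocator = new RootAllocator();
                 FlightClient client = FlightClient.builder(allocator,
                         Location.forGrpcInsecure("127.0.0.1", 8023)).build()) {
                FlightCallHeaders headers = new FlightCallHeaders();
                headers.insert("trace-id", "my-upstream-trace-id"); // picked up by the factory above
                client.listFlights(Criteria.ALL, new HeaderCallOption(headers));
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // FlightClient.close() may be interrupted
            }
        }
    }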
diff --git a/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/ProtoObjConvertor.java b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/ProtoObjConvertor.java
new file mode 100644
index 0000000..ad62cdf
--- /dev/null
+++ b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/ProtoObjConvertor.java
@@ -0,0 +1,270 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.server;
+
+import org.apache.arrow.vector.types.Types;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.Schema;
+import org.apache.commons.collections4.CollectionUtils;
+import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode;
+import org.secretflow.dataproxy.common.exceptions.DataproxyException;
+import org.secretflow.dataproxy.common.model.FlightContentFormatConfig;
+import org.secretflow.dataproxy.common.model.FlightContentFormatTypeEnum;
+import org.secretflow.dataproxy.common.model.dataset.Dataset;
+import org.secretflow.dataproxy.common.model.dataset.DatasetFormatConfig;
+import org.secretflow.dataproxy.common.model.dataset.DatasetSchema;
+import org.secretflow.dataproxy.common.model.dataset.format.CSVFormatConfig;
+import org.secretflow.dataproxy.common.model.dataset.format.DatasetFormatTypeEnum;
+import org.secretflow.dataproxy.common.model.dataset.schema.DatasetSchemaTypeEnum;
+import org.secretflow.dataproxy.common.model.datasource.DatasetLocationConfig;
+import org.secretflow.dataproxy.common.model.datasource.Datasource;
+import org.secretflow.dataproxy.common.model.datasource.DatasourceConnConfig;
+import org.secretflow.dataproxy.common.model.datasource.DatasourceTypeEnum;
+import org.secretflow.dataproxy.common.model.datasource.conn.LocalFileSystemConnConfig;
+import org.secretflow.dataproxy.common.model.datasource.conn.MysqlConnConfig;
+import org.secretflow.dataproxy.common.model.datasource.conn.ObjectFileSystemConnConfig;
+import org.secretflow.dataproxy.common.model.datasource.conn.OdpsConnConfig;
+import org.secretflow.dataproxy.common.model.datasource.location.FileSystemLocationConfig;
+import org.secretflow.dataproxy.common.model.datasource.location.MysqlLocationConfig;
+import org.secretflow.dataproxy.common.model.datasource.location.OdpsTableInfo;
+import org.secretflow.v1alpha1.common.Common;
+import org.secretflow.v1alpha1.kusciaapi.Domaindata;
+import org.secretflow.v1alpha1.kusciaapi.Domaindatasource;
+import org.secretflow.v1alpha1.kusciaapi.Flightdm;
+
+import java.util.stream.Collectors;
+
+/**
+ * Converter between Kuscia API objects and internal models.
+ *
+ * @author muhong
+ * @date 2023-08-30 18:55
+ */
+public class ProtoObjConvertor {
+
+    /**
+     * Convert Kuscia gRPC datasource metadata into the internal datasource model.
+     */
+    public static Datasource fromProto(Domaindatasource.DomainDataSource domainDataSource) {
+
+        return Datasource.builder()
+                .datasourceId(domainDataSource.getDatasourceId())
+                .name(domainDataSource.getName())
+                .connConfig(fromProto(domainDataSource.getType(), domainDataSource.getInfo()))
+                .writable(true)
+                .build();
+    }
+
+    public static DatasourceConnConfig fromProto(String domainDataSourceType, Domaindatasource.DataSourceInfo dataSourceInfo) {
+        switch (domainDataSourceType) {
+            case "localfs": {
+                LocalFileSystemConnConfig connConfig = LocalFileSystemConnConfig.builder().build();
+                if (dataSourceInfo.hasLocalfs()) {
+                    connConfig.setPath(dataSourceInfo.getLocalfs().getPath());
+                }
+
+                return DatasourceConnConfig.builder()
+                        .type(DatasourceTypeEnum.LOCAL_HOST)
+                        .connConfig(connConfig)
+                        .build();
+            }
+            case "oss": {
+                if (!dataSourceInfo.hasOss()) {
+                    throw DataproxyException.of(DataproxyErrorCode.PARAMS_NOT_EXIST_ERROR, "missing OSS connection info");
+                }
+
+                DatasourceTypeEnum type = null;
+                switch (dataSourceInfo.getOss().getStorageType()) {
+                    case "oss":
+                        type = DatasourceTypeEnum.OSS;
+                        break;
+                    case "minio":
+                        type = DatasourceTypeEnum.MINIO;
+                        break;
+                    default:
+                        type = DatasourceTypeEnum.OSS;
+                }
+
+                ObjectFileSystemConnConfig connConfig = ObjectFileSystemConnConfig.builder()
+                        .endpoint(dataSourceInfo.getOss().getEndpoint())
+                        .bucket(dataSourceInfo.getOss().getBucket())
+                        .objectKeyPrefix(dataSourceInfo.getOss().getPrefix())
+                        .accessKey(dataSourceInfo.getOss().getAccessKeyId())
+                        .accessSecret(dataSourceInfo.getOss().getAccessKeySecret())
+                        .build();
+                return DatasourceConnConfig.builder()
+                        .type(type)
+                        .connConfig(connConfig)
+                        .build();
+            }
+            case "mysql": {
+                if (!dataSourceInfo.hasDatabase()) {
+                    throw DataproxyException.of(DataproxyErrorCode.PARAMS_NOT_EXIST_ERROR, "missing database connection info");
+                }
+
+                MysqlConnConfig connConfig = MysqlConnConfig.builder()
+                        .host(dataSourceInfo.getDatabase().getEndpoint())
+                        .userName(dataSourceInfo.getDatabase().getUser())
+                        .password(dataSourceInfo.getDatabase().getPassword())
+                        .database(dataSourceInfo.getDatabase().getDatabase())
+                        .build();
+                return DatasourceConnConfig.builder()
+                        .type(DatasourceTypeEnum.MYSQL)
+                        .connConfig(connConfig)
+                        .build();
+            }
+            case "odps": {
+                if (!dataSourceInfo.hasOdps()) {
+                    throw DataproxyException.of(DataproxyErrorCode.PARAMS_NOT_EXIST_ERROR, "missing ODPS connection info");
+                }
+
+                OdpsConnConfig config =
+                        OdpsConnConfig.builder()
+                                .accessKeyId(dataSourceInfo.getOdps().getAccessKeyId())
+                                .accessKeySecret(dataSourceInfo.getOdps().getAccessKeySecret())
+                                .projectName(dataSourceInfo.getOdps().getProject())
+                                .endpoint(dataSourceInfo.getOdps().getEndpoint())
+                                .build();
+
+                return DatasourceConnConfig.builder()
+                        .type(DatasourceTypeEnum.ODPS)
+                        .connConfig(config)
+                        .build();
+            }
+            default:
+                throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "unsupported datasource type " + domainDataSourceType);
+        }
+    }
+
+    public static Dataset fromProto(Domaindata.DomainData domainData, Datasource datasource) {
+        DatasetFormatConfig formatConfig = DatasetFormatConfig.builder().build();
+
+        // map the dataset location info
+        DatasetLocationConfig locationConfig = DatasetLocationConfig.builder()
+                .datasourceId(domainData.getDatasourceId())
+                .build();
+        switch (datasource.getConnConfig().getType()) {
+            case LOCAL_HOST:
+            case OSS:
+            case MINIO:
+            case OBS:
+                locationConfig.setLocationConfig(FileSystemLocationConfig.builder()
+                        .relativePath(domainData.getRelativeUri())
+                        .build());
+
+                if (domainData.getFileFormat() == Common.FileFormat.CSV) {
+                    formatConfig.setType(DatasetFormatTypeEnum.CSV);
+                    formatConfig.setFormatConfig(CSVFormatConfig.builder().build());
+                } else {
+                    formatConfig.setType(DatasetFormatTypeEnum.BINARY_FILE);
+                }
+                break;
+            case MYSQL: {
+                locationConfig.setLocationConfig(MysqlLocationConfig.builder()
+                        .table(domainData.getRelativeUri())
+                        .build());
+                formatConfig.setType(DatasetFormatTypeEnum.TABLE);
+                break;
+            }
+            case ODPS:
+                locationConfig.setLocationConfig(OdpsTableInfo.fromKusciaData(domainData));
+                if (domainData.getFileFormat() == Common.FileFormat.CSV) {
+                    formatConfig.setType(DatasetFormatTypeEnum.TABLE);
+                } else {
+                    formatConfig.setType(DatasetFormatTypeEnum.BINARY_FILE);
+                }
+                break;
+            default:
+                throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "unsupported datasource type " + datasource.getConnConfig().getType());
+        }
+
+        DatasetSchema datasetSchema = DatasetSchema.builder().build();
+        switch (domainData.getType()) {
+            case "table": {
+                datasetSchema.setType(DatasetSchemaTypeEnum.STRUCTURED_DATA);
+                if (CollectionUtils.isNotEmpty(domainData.getColumnsList())) {
+
+                    Schema schema = new Schema(domainData.getColumnsList().stream()
+                            .map(column ->
+                                    Field.nullable(column.getName(), parseArrowTypeFrom(column.getType())))
+                            .collect(Collectors.toList()));
+                    datasetSchema.setArrowSchema(schema);
+                }
+                break;
+            }
+            case "model", "report": {
+                datasetSchema.setType(DatasetSchemaTypeEnum.BINARY);
+                break;
+            }
+            default:
+                datasetSchema.setType(DatasetSchemaTypeEnum.BINARY);
+                break;
+        }
+
+        return Dataset.builder()
+                .datasetId(domainData.getDomaindataId())
+                .name(domainData.getName())
+                .locationConfig(locationConfig)
+                .schema(datasetSchema)
+                .formatConfig(formatConfig)
+                .ownerId(domainData.getVendor())
+                .build();
+    }
+
+    public static ArrowType parseArrowTypeFrom(String type) {
+        // string integer float datetime timestamp
+        return switch (type) {
+            case "int8" -> Types.MinorType.TINYINT.getType();
+            case "int16" -> Types.MinorType.SMALLINT.getType();
+            case "int32" -> Types.MinorType.INT.getType();
+            case "int64", "int" -> Types.MinorType.BIGINT.getType();
+            case "uint8" -> Types.MinorType.UINT1.getType();
+            case "uint16" -> Types.MinorType.UINT2.getType();
+            case "uint32" -> Types.MinorType.UINT4.getType();
+            case "uint64" -> Types.MinorType.UINT8.getType();
+            case "float32" -> Types.MinorType.FLOAT4.getType();
+            case "float64", "float" -> Types.MinorType.FLOAT8.getType();
+            case "date32" -> Types.MinorType.DATEDAY.getType();
+            case "date64" -> Types.MinorType.DATEMILLI.getType();
+            case "bool" -> Types.MinorType.BIT.getType();
+            case "string", "str" -> Types.MinorType.VARCHAR.getType();
+            case "binary" -> Types.MinorType.VARBINARY.getType();
+            default -> throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "unsupported field type " + type);
+        };
+    }
+
+    public static FlightContentFormatConfig fromProto(Flightdm.ContentType contentType) {
+        FlightContentFormatConfig formatConfig = FlightContentFormatConfig.builder().build();
+
+        switch (contentType) {
+            case CSV:
+                formatConfig.setFormatType(FlightContentFormatTypeEnum.CSV);
+                formatConfig.setFormatConfig(CSVFormatConfig.builder().build());
+                break;
+            case RAW:
+                formatConfig.setFormatType(FlightContentFormatTypeEnum.BINARY_FILE);
+                break;
+            case Table:
+            default:
+                formatConfig.setFormatType(FlightContentFormatTypeEnum.STRUCTURED_DATA);
+                break;
+        }
+
+        return formatConfig;
+    }
+}
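A quick usage sketch of the type mapping above, building an Arrow schema from Kuscia-style (name, type) pairs; the column names and types here are made up for illustration:

    import java.util.List;

    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.Schema;

    public class SchemaMappingSketch {
        public static void main(String[] args) {
            Schema schema = new Schema(List.of(
                    Field.nullable("id", ProtoObjConvertor.parseArrowTypeFrom("int64")),
                    Field.nullable("score", ProtoObjConvertor.parseArrowTypeFrom("float32")),
                    Field.nullable("name", ProtoObjConvertor.parseArrowTypeFrom("str"))));
            System.out.println(schema);
        }
    }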
diff --git a/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/config/ArrowConfig.java b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/config/ArrowConfig.java
new file mode 100644
index 0000000..c613ee0
--- /dev/null
+++ b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/config/ArrowConfig.java
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.server.config;
+
+import lombok.Data;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.arrow.flight.Location;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.commons.lang3.StringUtils;
+import org.secretflow.dataproxy.common.utils.JsonUtils;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+
+import java.io.IOException;
+import java.net.Inet4Address;
+import java.net.InetAddress;
+import java.net.NetworkInterface;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Enumeration;
+import java.util.List;
+
+/**
+ * @author muhong
+ * @date 2023-08-07 10:43 AM
+ */
+@Slf4j
+@Configuration
public class ArrowConfig {
+
+    @Value("${dataproxy.flight.port}")
+    private int defaultPort;
+
+    @Bean
+    public BufferAllocator bufferAllocator() {
+        return new RootAllocator();
+    }
+
+    @Bean
+    public Location location() {
+        try {
+            String localMachineHost = "";
+            final Enumeration<NetworkInterface> interfaces = NetworkInterface.getNetworkInterfaces();
+            while (interfaces.hasMoreElements()) {
+                NetworkInterface networkInterface = interfaces.nextElement();
+                if (networkInterface.isLoopback() || !networkInterface.isUp()) {
+                    continue;
+                }
+                final Enumeration<InetAddress> addresses = networkInterface.getInetAddresses();
+                while (addresses.hasMoreElements()) {
+                    InetAddress inetAddress = addresses.nextElement();
+                    if (!inetAddress.isLoopbackAddress() && inetAddress instanceof Inet4Address) {
+                        localMachineHost = inetAddress.getHostAddress();
+                    }
+                }
+            }
+
+            int port = parsePort();
+            return Location.forGrpcInsecure(localMachineHost, port);
+        } catch (Exception e) {
+            log.error("config location error", e);
+            throw new RuntimeException(e);
+        }
+    }
+
+    private int parsePort() {
+        String dpConfigFile = System.getenv("DP_CONFIG_FILE");
+        if (StringUtils.isEmpty(dpConfigFile)) {
+            log.info("dp config file env not found, use default port");
+            return defaultPort;
+        }
+
+        String dpConfigJson = null;
+        try {
+            dpConfigJson = Files.readString(Paths.get(dpConfigFile), Charset.defaultCharset());
+        } catch (IOException e) {
+            throw new RuntimeException("dp config file read error", e);
+        }
+
+        DPConfig dpConfig = JsonUtils.toJavaObject(dpConfigJson, DPConfig.class);
+        AllocatedPorts allocatedPorts = JsonUtils.toJavaObject(dpConfig.getAllocated_ports(), AllocatedPorts.class);
+        for (AllocatedPort arrowFlightPort : allocatedPorts.getPorts()) {
+            if (arrowFlightPort.getName().equals("dp")) {
+                return arrowFlightPort.getPort();
+            }
+        }
+        throw new RuntimeException("dp port config not found in " + dpConfigFile);
+    }
+
+    @Data
+    private static class DPConfig {
+        private String allocated_ports;
+    }
+
+    @Data
+    private static class AllocatedPorts {
+        private List<AllocatedPort> ports;
+    }
+
+    @Data
+    private static class AllocatedPort {
+        private String name;
+        private Integer port;
+    }
+}
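As parsePort() shows, DP_CONFIG_FILE names a JSON file whose allocated_ports field is itself a JSON-encoded string that is parsed a second time. A sketch of that double-encoded shape (the file contents and port value are illustrative, and plain Jackson stands in for the project's JsonUtils):

    import com.fasterxml.jackson.databind.ObjectMapper;

    public class DpConfigSketch {
        public static void main(String[] args) throws Exception {
            // Equivalent file contents: {"allocated_ports": "{\"ports\":[{\"name\":\"dp\",\"port\":8023}]}"}
            String file = "{\"allocated_ports\":"
                    + "\"{\\\"ports\\\":[{\\\"name\\\":\\\"dp\\\",\\\"port\\\":8023}]}\"}";
            ObjectMapper mapper = new ObjectMapper();
            String allocated = mapper.readTree(file).get("allocated_ports").asText();
            int port = mapper.readTree(allocated).get("ports").get(0).get("port").asInt();
            System.out.println(port); // 8023
        }
    }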
diff --git a/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/flight/DataproxyProducer.java b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/flight/DataproxyProducer.java
new file mode 100644
index 0000000..7ca95ef
--- /dev/null
+++ b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/flight/DataproxyProducer.java
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.server.flight;
+
+import org.apache.arrow.flight.FlightDescriptor;
+import org.apache.arrow.flight.FlightInfo;
+import org.apache.arrow.flight.FlightProducer;
+import org.secretflow.v1alpha1.kusciaapi.Flightinner;
+
+/**
+ * Dataproxy facade
+ *
+ * @author muhong
+ * @date 2023-09-13 16:05
+ */
+public interface DataproxyProducer extends FlightProducer, AutoCloseable {
+
+    /**
+     * (Query) get flight info
+     *
+     * @param command    Read command
+     * @param context    Context
+     * @param descriptor Raw descriptor
+     * @return flight info describing where to fetch the data
+     */
+    FlightInfo getFlightInfoQuery(Flightinner.CommandDataMeshQuery command, CallContext context, FlightDescriptor descriptor);
+
+    /**
+     * (Update) get flight info
+     *
+     * @param command    Update command
+     * @param context    Context
+     * @param descriptor Raw descriptor
+     * @return flight info describing where to write the data
+     */
+    FlightInfo getFlightInfoUpdate(Flightinner.CommandDataMeshUpdate command, CallContext context, FlightDescriptor descriptor);
+}
\ No newline at end of file
diff --git a/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/flight/DataproxyProducerImpl.java b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/flight/DataproxyProducerImpl.java
new file mode 100644
index 0000000..621e2d4
--- /dev/null
+++ b/dataproxy-server/src/main/java/org/secretflow/dataproxy/server/flight/DataproxyProducerImpl.java
@@ -0,0 +1,357 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.secretflow.dataproxy.server.flight; + +import com.google.protobuf.Any; +import lombok.extern.slf4j.Slf4j; +import org.apache.arrow.flight.Action; +import org.apache.arrow.flight.ActionType; +import org.apache.arrow.flight.CallStatus; +import org.apache.arrow.flight.Criteria; +import org.apache.arrow.flight.FlightDescriptor; +import org.apache.arrow.flight.FlightEndpoint; +import org.apache.arrow.flight.FlightInfo; +import org.apache.arrow.flight.FlightStream; +import org.apache.arrow.flight.Location; +import org.apache.arrow.flight.PutResult; +import org.apache.arrow.flight.Result; +import org.apache.arrow.flight.Ticket; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.ArrowReader; +import org.apache.arrow.vector.types.pojo.Schema; +import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode; +import org.secretflow.dataproxy.common.exceptions.DataproxyException; +import org.secretflow.dataproxy.common.model.command.Command; +import org.secretflow.dataproxy.common.model.command.CommandTypeEnum; +import org.secretflow.dataproxy.common.model.command.DatasetReadCommand; +import org.secretflow.dataproxy.common.model.command.DatasetWriteCommand; +import org.secretflow.dataproxy.common.model.dataset.Dataset; +import org.secretflow.dataproxy.common.model.datasource.DatasetLocationConfig; +import org.secretflow.dataproxy.common.model.datasource.Datasource; +import org.secretflow.dataproxy.common.model.datasource.location.OdpsTableInfo; +import org.secretflow.dataproxy.common.utils.GrpcUtils; +import org.secretflow.dataproxy.common.utils.JsonUtils; +import org.secretflow.dataproxy.common.utils.ProtoBufJsonUtils; +import org.secretflow.dataproxy.server.ProtoObjConvertor; +import org.secretflow.dataproxy.service.DataProxyService; +import org.secretflow.dataproxy.service.TicketService; +import org.secretflow.v1alpha1.kusciaapi.Flightdm; +import org.secretflow.v1alpha1.kusciaapi.Flightinner; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; + +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * @author muhong + * @date 2023-09-13 16:08 + */ +@Slf4j +@Service +public class DataproxyProducerImpl implements DataproxyProducer { + + private static final Schema DEFAULT_SCHEMA = new Schema(new ArrayList<>()); + @Autowired + private TicketService ticketService; + @Autowired + private DataProxyService dataProxyService; + @Autowired + private Location location; + @Autowired + private BufferAllocator rootAllocator; + + @Override + public FlightInfo getFlightInfo(CallContext context, FlightDescriptor descriptor) { + final Any command = GrpcUtils.parseOrThrow(descriptor.getCommand()); + + try { + if (command.is(Flightinner.CommandDataMeshQuery.class)) { + return getFlightInfoQuery(GrpcUtils.unpackOrThrow(command, Flightinner.CommandDataMeshQuery.class), context, descriptor); + } else if (command.is(Flightinner.CommandDataMeshUpdate.class)) { + return getFlightInfoUpdate(GrpcUtils.unpackOrThrow(command, Flightinner.CommandDataMeshUpdate.class), context, descriptor); + } + } catch (DataproxyException e) { + throw CallStatus.INVALID_ARGUMENT + .withCause(e) + .withDescription(e.getDescription()) + .toRuntimeException(); + } catch (Exception e) { + throw CallStatus.INTERNAL + .withCause(e) + .withDescription("Unknown exception") + .toRuntimeException(); + } + + 
log.error("[getFlightInfo] unrecognized request, type:{}", command.getTypeUrl()); + throw CallStatus.INVALID_ARGUMENT + .withDescription("Unrecognized request: " + command.getTypeUrl()) + .toRuntimeException(); + } + + @Override + public void doAction(CallContext context, Action action, StreamListener listener) { + final Any actionBody = GrpcUtils.parseOrThrow(action.getBody()); + + Result result = null; + try { + + } catch (DataproxyException e) { + throw CallStatus.INVALID_ARGUMENT + .withCause(e) + .withDescription(e.getDescription()) + .toRuntimeException(); + } catch (Exception e) { + throw CallStatus.INTERNAL + .withCause(e) + .withDescription("Unknown exception") + .toRuntimeException(); + } + + if (result != null) { + listener.onNext(result); + listener.onCompleted(); + return; + } + + log.error("[doAction] unrecognized request"); + throw CallStatus.INVALID_ARGUMENT + .withDescription("Unrecognized request: " + actionBody.getTypeUrl()) + .toRuntimeException(); + } + + @Override + public void getStream(CallContext context, Ticket ticket, ServerStreamListener listener) { + try { + getStreamReadData(context, ticket, listener); + } catch (DataproxyException e) { + throw CallStatus.INVALID_ARGUMENT + .withCause(e) + .withDescription(e.getDescription()) + .toRuntimeException(); + } catch (Exception e) { + log.error("[getStream] unknown exception"); + throw CallStatus.INTERNAL + .withCause(e) + .withDescription("Unknown exception") + .toRuntimeException(); + } + } + + @Override + public void listFlights(CallContext context, Criteria criteria, StreamListener listener) { + + } + + @Override + public Runnable acceptPut(CallContext context, FlightStream flightStream, StreamListener ackStream) { + try { + return acceptPutDataUpdate(context, flightStream, ackStream); + } catch (DataproxyException e) { + throw CallStatus.INVALID_ARGUMENT + .withCause(e) + .withDescription(e.getDescription()) + .toRuntimeException(); + } catch (Exception e) { + log.error("[acceptPut] unknown exception"); + throw CallStatus.INTERNAL + .withCause(e) + .withDescription("Unknown exception") + .toRuntimeException(); + } + } + + @Override + public void listActions(CallContext context, StreamListener listener) { + // no implements + } + + @Override + public void close() throws Exception { + + } + + @Override + public FlightInfo getFlightInfoQuery(Flightinner.CommandDataMeshQuery command, CallContext context, FlightDescriptor descriptor) { + log.info("[getFlightInfoQuery] get flight info query start"); + + try { + Datasource datasource = ProtoObjConvertor.fromProto(command.getDatasource()); + Dataset dataset = ProtoObjConvertor.fromProto(command.getDomaindata(), datasource); + + // TODO: 不合理入参 + DatasetLocationConfig locationConfig = dataset.getLocationConfig(); + if (locationConfig.getLocationConfig() instanceof OdpsTableInfo odpsTableInfo) { + String partitionSpec = command.getQuery().getPartitionSpec(); + locationConfig.setLocationConfig(new OdpsTableInfo(odpsTableInfo.tableName(), partitionSpec, odpsTableInfo.fields())); + } + + Command readCommand = Command.builder() + .type(CommandTypeEnum.READ) + .commandInfo(DatasetReadCommand.builder() + .connConfig(datasource.getConnConfig()) + .locationConfig(locationConfig) + .formatConfig(dataset.getFormatConfig()) + .schema(dataset.getSchema().getArrowSchema()) + .fieldList(command.getQuery().getColumnsList()) + .outputFormatConfig(ProtoObjConvertor.fromProto(command.getQuery().getContentType())) + .build()) + .build(); + + log.info("[getFlightInfoQuery] get flight info 
query, command:{}", JsonUtils.toJSONString(readCommand)); + + byte[] ticketBytes = ticketService.generateTicket(readCommand); + + // 数据端,当前只支持1 + List endpointList = Collections.singletonList( + new FlightEndpoint(new Ticket(ticketBytes), location)); + + log.info("[getFlightInfoQuery] get flight info query completed"); + return new FlightInfo(DEFAULT_SCHEMA, descriptor, endpointList, 0, 0); + } catch (DataproxyException e) { + log.error("[getFlightInfoQuery] get flight info query error", e); + throw e; + } catch (Exception e) { + log.error("[getFlightInfoQuery] get flight info query unknown exception", e); + throw DataproxyException.of(DataproxyErrorCode.KUSCIA_GET_FLIGHT_INFO_QUERY_ERROR, e); + } + } + + @Override + public FlightInfo getFlightInfoUpdate(Flightinner.CommandDataMeshUpdate command, CallContext context, FlightDescriptor descriptor) { + log.info("[getFlightInfoUpdate] get flight info update start"); + + try { + Datasource datasource = ProtoObjConvertor.fromProto(command.getDatasource()); + Dataset dataset = ProtoObjConvertor.fromProto(command.getDomaindata(), datasource); + + // TODO: 不合理入参 + DatasetLocationConfig locationConfig = dataset.getLocationConfig(); + if (locationConfig.getLocationConfig() instanceof OdpsTableInfo odpsTableInfo) { + String partitionSpec = command.getUpdate().getPartitionSpec(); + locationConfig.setLocationConfig(new OdpsTableInfo(odpsTableInfo.tableName(), partitionSpec, odpsTableInfo.fields())); + } + + Command writeCommand = Command.builder() + .type(CommandTypeEnum.WRITE) + .commandInfo(DatasetWriteCommand.builder() + .connConfig(datasource.getConnConfig()) + .locationConfig(locationConfig) + .formatConfig(dataset.getFormatConfig()) + .schema(dataset.getSchema().getArrowSchema()) + .inputFormatConfig(ProtoObjConvertor.fromProto(command.getUpdate().getContentType())) + .extraOptions(command.getUpdate().getExtraOptionsMap()) + .build()) + .build(); + + log.info("[getFlightInfoUpdate] get flight info update, command:{}", JsonUtils.toJSONString(writeCommand)); + + byte[] ticketBytes = ticketService.generateTicket(writeCommand); + Flightdm.TicketDomainDataQuery commandTicketWrite = Flightdm.TicketDomainDataQuery.newBuilder() + .setDomaindataHandle(new String(ticketBytes)) + .build(); + + // 数据端,当前只支持1 + List endpointList = Collections.singletonList( + new FlightEndpoint(new Ticket(Any.pack(commandTicketWrite).toByteArray()), location)); + + log.info("[getFlightInfoUpdate] get flight info update completed"); + return new FlightInfo(DEFAULT_SCHEMA, descriptor, endpointList, 0, 0); + } catch (DataproxyException e) { + log.error("[getFlightInfoUpdate] get flight info update error", e); + throw e; + } catch (Exception e) { + log.error("[getFlightInfoUpdate] get flight info update unknown exception", e); + throw DataproxyException.of(DataproxyErrorCode.KUSCIA_GET_FLIGHT_INFO_UPDATE_ERROR, e); + } + } + + public void getStreamReadData(CallContext context, Ticket ticket, ServerStreamListener listener) { + log.info("[getStreamReadData] get stream start, ticket:{}", new String(ticket.getBytes())); + + try { + // 根据ticket获取预先缓存的查询命令 + Command command = ticketService.getCommandByTicket(ticket.getBytes()); + if (command.getType() != CommandTypeEnum.READ) { + throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "操作指令类型与接口不匹配"); + } + + log.info("[getStreamReadData] parse command from ticket success, command:{}", JsonUtils.toJSONString(command)); + try (ArrowReader arrowReader = dataProxyService.generateArrowReader(rootAllocator, (DatasetReadCommand) 
+                listener.start(arrowReader.getVectorSchemaRoot());
+                while (arrowReader.loadNextBatch()) {
+                    listener.putNext();
+                }
+                listener.completed();
+                log.info("[getStreamReadData] get stream completed");
+            }
+        } catch (DataproxyException e) {
+            log.error("[getStreamReadData] get stream error", e);
+            throw e;
+        } catch (Exception e) {
+            log.error("[getStreamReadData] get stream unknown exception", e);
+            throw DataproxyException.of(DataproxyErrorCode.KUSCIA_GET_STREAM_ERROR, e);
+        }
+    }
+
+    public Runnable acceptPutDataUpdate(CallContext context, FlightStream flightStream, StreamListener<PutResult> ackStream) {
+        log.info("[acceptPutDataUpdate] accept put data (update) start");
+
+        try {
+            final Any acceptPutCommand = GrpcUtils.parseOrThrow(flightStream.getDescriptor().getCommand());
+            if (!acceptPutCommand.is(Flightdm.TicketDomainDataQuery.class)) {
+                throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "command type does not match this interface");
+            }
+
+            Flightdm.TicketDomainDataQuery ticketDomainDataQuery = GrpcUtils.unpackOrThrow(acceptPutCommand, Flightdm.TicketDomainDataQuery.class);
+            log.info("[acceptPutDataUpdate] parse ticketDomainDataQuery success, ticketDomainDataQuery:{}", ProtoBufJsonUtils.toJSONString(ticketDomainDataQuery));
+
+            Command command = ticketService.getCommandByTicket(ticketDomainDataQuery.getDomaindataHandle().getBytes(StandardCharsets.UTF_8));
+            log.info("[acceptPutDataUpdate] parse command from ticket success, command:{}", JsonUtils.toJSONString(command));
+            if (command.getType() != CommandTypeEnum.WRITE) {
+                throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "command type does not match this interface");
+            }
+
+            return () -> {
+                try {
+                    dataProxyService.datasetWrite((DatasetWriteCommand) command.getCommandInfo(), flightStream,
+                            root -> {
+                                String msg = "row count: " + root.getRowCount();
+                                try (final ArrowBuf buffer = rootAllocator.buffer(msg.getBytes(StandardCharsets.UTF_8).length)) {
+                                    buffer.writeBytes(msg.getBytes(StandardCharsets.UTF_8));
+                                    ackStream.onNext(PutResult.metadata(buffer));
+                                }
+                            });
+                } catch (DataproxyException e) {
+                    throw CallStatus.INTERNAL
+                            .withCause(e)
+                            .withDescription(e.getDescription())
+                            .toRuntimeException();
+                }
+            };
+        } catch (DataproxyException e) {
+            log.error("[acceptPutDataUpdate] accept put data (update) error", e);
+            throw e;
+        } catch (Exception e) {
+            log.error("[acceptPutDataUpdate] accept put data (update) unknown exception", e);
+            throw DataproxyException.of(DataproxyErrorCode.KUSCIA_ACCEPT_PUT_ERROR, e);
+        }
+    }
+}
diff --git a/dataproxy-server/src/main/resources/application.yaml b/dataproxy-server/src/main/resources/application.yaml
new file mode 100644
index 0000000..080afba
--- /dev/null
+++ b/dataproxy-server/src/main/resources/application.yaml
@@ -0,0 +1,26 @@
+spring:
+  # profiles:
+  #   active: local
+  autoconfigure:
+    exclude: org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration
+  application:
+    name: dataproxy
+  servlet:
+    multipart:
+      max-file-size: -1
+      max-request-size: -1
+      file-size-threshold: -1
+
+logging:
+  level:
+    root: info
+  file:
+    path: "./logs"
+
+dataproxy:
+  flight:
+    host: 127.0.0.1 # endpoint IP returned by getFlightInfo
+    port: 8023
+  ticket:
+    timeout: 300 # expiration time, in seconds
+    onlyOnce: true # whether a ticket is single-use; true: consumed once, false: reusable until it expires
\ No newline at end of file
diff --git a/dataproxy-server/src/main/resources/logback-spring.xml b/dataproxy-server/src/main/resources/logback-spring.xml
new file mode 100644
index 0000000..56211f5
--- /dev/null
+++ b/dataproxy-server/src/main/resources/logback-spring.xml
@@ -0,0 +1,112 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+    <springProperty name="application.name" source="spring.application.name"/>
+    <springProperty name="logging.file.path" source="logging.file.path"/>
+    <springProperty name="logging.level" source="logging.level.root"/>
+
+    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} %X{TraceId} - %msg%n</pattern>
+        </encoder>
+    </appender>
+
+    <appender name="ERROR-APPENDER" class="ch.qos.logback.core.rolling.RollingFileAppender">
+        <append>true</append>
+        <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
+            <level>ERROR</level>
+        </filter>
+        <file>${logging.file.path}/${application.name}/common-error.log</file>
+        <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
+            <fileNamePattern>${logging.file.path}/${application.name}/common-error.%d{yyyy-MM-dd}.%i.log</fileNamePattern>
+            <maxFileSize>100MB</maxFileSize>
+            <maxHistory>30</maxHistory>
+            <totalSizeCap>512MB</totalSizeCap>
+        </rollingPolicy>
+        <encoder>
+            <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} [%method] %X{TraceId} - %msg%n</pattern>
+            <charset>UTF-8</charset>
+        </encoder>
+    </appender>
+
+    <appender name="DEFAULT-APPENDER" class="ch.qos.logback.core.rolling.RollingFileAppender">
+        <append>true</append>
+        <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
+            <level>${logging.level}</level>
+        </filter>
+        <file>${logging.file.path}/${application.name}/common-default.log</file>
+        <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
+            <fileNamePattern>${logging.file.path}/${application.name}/common-default.%d{yyyy-MM-dd}.%i.log</fileNamePattern>
+            <maxFileSize>100MB</maxFileSize>
+            <maxHistory>30</maxHistory>
+            <totalSizeCap>512MB</totalSizeCap>
+        </rollingPolicy>
+        <encoder>
+            <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} %X{TraceId} - %msg%n</pattern>
+            <charset>UTF-8</charset>
+        </encoder>
+    </appender>
+
+    <appender name="ASYNC-TASK-APPENDER" class="ch.qos.logback.core.rolling.RollingFileAppender">
+        <append>true</append>
+        <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
+            <level>${logging.level}</level>
+        </filter>
+        <file>${logging.file.path}/${application.name}/fastdf-dataproxy-async-task.log</file>
+        <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
+            <fileNamePattern>${logging.file.path}/${application.name}/fastdf-dataproxy-async-task.%d{yyyy-MM-dd}.%i.log</fileNamePattern>
+            <maxFileSize>100MB</maxFileSize>
+            <maxHistory>30</maxHistory>
+            <totalSizeCap>512MB</totalSizeCap>
+        </rollingPolicy>
+        <encoder>
+            <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{50} %X{TraceId} - %msg%n</pattern>
+            <charset>UTF-8</charset>
+        </encoder>
+    </appender>
+
+    <appender name="SECURITY-APPENDER" class="ch.qos.logback.core.rolling.RollingFileAppender">
+        <append>true</append>
+        <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
+            <level>${logging.level}</level>
+        </filter>
+        <file>${logging.file.path}/${application.name}/security-info.log</file>
+        <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
+            <fileNamePattern>${logging.file.path}/${application.name}/security-info.%d{yyyy-MM-dd}.%i.log</fileNamePattern>
+            <maxFileSize>100MB</maxFileSize>
+            <maxHistory>30</maxHistory>
+            <totalSizeCap>512MB</totalSizeCap>
+        </rollingPolicy>
+        <encoder>
+            <pattern>%d [%X{traceId} %X{rpcId} - %X{loginUserEmail}/%X{loginUserID}/%X{remoteAddr}/%X{clientId} - %X{requestURIWithQueryString}] %-5p %c{2} - %msg%n</pattern>
+            <charset>UTF-8</charset>
+        </encoder>
+    </appender>
+
+    <root level="${logging.level}">
+        <appender-ref ref="CONSOLE"/>
+        <appender-ref ref="DEFAULT-APPENDER"/>
+        <appender-ref ref="ERROR-APPENDER"/>
+    </root>
+</configuration>
diff --git a/dataproxy-service/pom.xml b/dataproxy-service/pom.xml
new file mode 100644
index 0000000..bb2f2cd
--- /dev/null
+++ b/dataproxy-service/pom.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>org.secretflow</groupId>
+        <artifactId>dataproxy</artifactId>
+        <version>0.0.1-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>dataproxy-service</artifactId>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.secretflow</groupId>
+            <artifactId>dataproxy-common</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.secretflow</groupId>
+            <artifactId>dataproxy-manager</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>com.github.ben-manes.caffeine</groupId>
+            <artifactId>caffeine</artifactId>
+        </dependency>
+
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+        </dependency>
+    </dependencies>
+</project>
\ No newline at end of file
diff --git a/dataproxy-service/src/main/java/org/secretflow/dataproxy/service/DataProxyService.java b/dataproxy-service/src/main/java/org/secretflow/dataproxy/service/DataProxyService.java
new file mode 100644
index 0000000..42a0e0e
--- /dev/null
+++ b/dataproxy-service/src/main/java/org/secretflow/dataproxy/service/DataProxyService.java
@@ -0,0 +1,89 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.service;
+
+import org.secretflow.dataproxy.common.model.InferSchemaResult;
+import org.secretflow.dataproxy.common.model.command.DatasetReadCommand;
+import org.secretflow.dataproxy.common.model.command.DatasetWriteCommand;
+import org.secretflow.dataproxy.common.model.dataset.DatasetFormatConfig;
+import org.secretflow.dataproxy.common.model.datasource.DatasetLocationConfig;
+import org.secretflow.dataproxy.common.model.datasource.DatasourceConnConfig;
+import org.secretflow.dataproxy.manager.Connector;
+
+import org.apache.arrow.flight.FlightStream;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.ipc.ArrowReader;
+
+/**
+ * @author muhong
+ * @date 2023-09-01 11:01
+ */
+public interface DataProxyService {
+
+    /**
+     * Build a datasource connector
+     *
+     * @param connConfig datasource connection config
+     * @return datasource connector
+     */
+    Connector buildConnector(DatasourceConnConfig connConfig);
+
+    /**
+     * Validate datasource connection parameters
+     *
+     * @param connConfig datasource connection config
+     */
+    void validateConn(DatasourceConnConfig connConfig);
+
+    /**
+     * Infer the data schema
+     *
+     * @param allocator      memory allocator
+     * @param connConfig     datasource connection config
+     * @param locationConfig dataset location config
+     * @param formatConfig   dataset format config
+     * @return inferred schema together with the detailed format config
+     */
+    InferSchemaResult inferSchema(BufferAllocator allocator, DatasourceConnConfig connConfig, DatasetLocationConfig locationConfig, DatasetFormatConfig formatConfig);
+
+    /**
+     * Read data
+     *
+     * @param allocator   memory allocator
+     * @param readCommand dataset read command
+     * @return Arrow streaming reader
+     */
+    ArrowReader generateArrowReader(BufferAllocator allocator, DatasetReadCommand readCommand);
+
+    /**
+     * Write data
+     *
+     * @param writeCommand  dataset write command
+     * @param flightStream  Arrow data stream to store
+     * @param writeCallback callback invoked after each batch is written
+     */
+    void datasetWrite(DatasetWriteCommand writeCommand, FlightStream flightStream, WriteCallback writeCallback);
+
+    /**
+     * Callback invoked after a single batch has been written
+     */
+    interface WriteCallback {
+        void ack(VectorSchemaRoot root);
+    }
+}
\ No newline at end of file
diff --git a/dataproxy-service/src/main/java/org/secretflow/dataproxy/service/TicketService.java b/dataproxy-service/src/main/java/org/secretflow/dataproxy/service/TicketService.java
new file mode 100644
index 0000000..30a5aa7
--- /dev/null
+++ b/dataproxy-service/src/main/java/org/secretflow/dataproxy/service/TicketService.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.service;
+
+import org.secretflow.dataproxy.common.model.command.Command;
+
+/**
+ * @author muhong
+ * @date 2023-08-31 11:02
+ */
+public interface TicketService {
+
+    /**
+     * Generate a ticket for the given command
+     *
+     * @param command data command
+     * @return ticket
+     */
+    byte[] generateTicket(Command command);
+
+    /**
+     * Look up the data command for a ticket
+     *
+     * @param ticket ticket
+     * @return data command
+     */
+    Command getCommandByTicket(byte[] ticket);
+}
\ No newline at end of file
diff --git a/dataproxy-service/src/main/java/org/secretflow/dataproxy/service/impl/DataProxyServiceDirectImpl.java b/dataproxy-service/src/main/java/org/secretflow/dataproxy/service/impl/DataProxyServiceDirectImpl.java
new file mode 100644
index 0000000..2383660
--- /dev/null
+++ b/dataproxy-service/src/main/java/org/secretflow/dataproxy/service/impl/DataProxyServiceDirectImpl.java
@@ -0,0 +1,184 @@
+/*
+ * Copyright 2023 Ant Group Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.secretflow.dataproxy.service.impl;
+
+import com.github.benmanes.caffeine.cache.Cache;
+import com.github.benmanes.caffeine.cache.Caffeine;
+import lombok.extern.slf4j.Slf4j;
+import org.apache.arrow.flight.FlightStream;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.ipc.ArrowReader;
+import org.apache.commons.collections4.CollectionUtils;
+import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode;
+import org.secretflow.dataproxy.common.exceptions.DataproxyException;
+import org.secretflow.dataproxy.common.model.InferSchemaResult;
+import org.secretflow.dataproxy.common.model.command.DatasetReadCommand;
+import org.secretflow.dataproxy.common.model.command.DatasetWriteCommand;
+import org.secretflow.dataproxy.common.model.dataset.DatasetFormatConfig;
+import org.secretflow.dataproxy.common.model.datasource.DatasetLocationConfig;
+import org.secretflow.dataproxy.common.model.datasource.DatasourceConnConfig;
+import org.secretflow.dataproxy.common.model.datasource.conn.JdbcBaseConnConfig;
+import org.secretflow.dataproxy.common.utils.JsonUtils;
+import org.secretflow.dataproxy.manager.Connector;
+import org.secretflow.dataproxy.manager.DataReader;
+import org.secretflow.dataproxy.manager.DataWriter;
+import org.secretflow.dataproxy.manager.connector.filesystem.FileSystemConnector;
+import org.secretflow.dataproxy.manager.connector.odps.OdpsConnector;
+import org.secretflow.dataproxy.manager.connector.rdbms.JdbcConnector;
+import org.secretflow.dataproxy.service.DataProxyService;
+import org.springframework.stereotype.Service;
+
+import javax.annotation.PostConstruct;
+
+/**
+ * Simple data-processing implementation (direct data transfer)
+ *
+ * @author muhong
+ * @date 2023-09-01 17:12
+ */
+@Slf4j
+@Service
+public class DataProxyServiceDirectImpl implements DataProxyService {
+
+    protected Cache<String, Connector> connectorCache;
+
+    @PostConstruct
+    private void init() {
+        connectorCache = Caffeine.newBuilder()
+                .maximumSize(100)
+                .removalListener((key, connector, cause) -> {
+                    if (connector != null) {
+                        try {
+                            ((Connector) connector).close();
+                            log.info("[DataProxyServiceDirectImpl] remove item from connector cache success, cause:{}, key:{}", cause, key);
+                        } catch (Exception e) {
+                            log.error("[DataProxyServiceDirectImpl] remove item from connector cache failed, because connector close failed, conn config: {}",
+                                    key, e);
+                        }
+                    }
+                })
+                .build();
+    }
+
+    /**
+     * Build a datasource connector
+     *
+     * @param connConfig datasource connection config
+     * @return datasource connector
+     */
+    @Override
+    public synchronized Connector buildConnector(DatasourceConnConfig connConfig) {
+        String key = connConfig.generateUniqueId();
+
+        Connector connector = connectorCache.getIfPresent(key);
+        if (connector != null) {
+            if (connector.isAvailable()) {
+                return connector;
+            } else {
+                connectorCache.invalidate(key);
+            }
+        }
+
+        if (connConfig.getType() == null) {
+            throw DataproxyException.of(DataproxyErrorCode.PARAMS_NOT_EXIST_ERROR, "datasource type field is missing");
+        }
+
+        switch (connConfig.getType()) {
+            case MYSQL: {
+                // validate that the connection config is present
+                if (connConfig.getConnConfig() == null) {
+                    throw DataproxyException.of(DataproxyErrorCode.PARAMS_NOT_EXIST_ERROR, "datasource connection config is missing");
+                }
+                connector = new JdbcConnector(connConfig.getType(), (JdbcBaseConnConfig) connConfig.getConnConfig());
+                break;
+            }
+            case MINIO:
+            case OSS:
+            case OBS:
+            case LOCAL_HOST:
+                connector = new FileSystemConnector(connConfig.getType(), connConfig.getConnConfig());
+                break;
+            case ODPS:
+                connector = new OdpsConnector(connConfig.getConnConfig());
+                break;
+            default:
+                throw DataproxyException.of(DataproxyErrorCode.PARAMS_UNRELIABLE, "unsupported datasource type " + connConfig.getType());
+        }
+        connectorCache.put(key, connector);
+        return connector;
+    }
+
+    @Override
+    public void validateConn(DatasourceConnConfig connConfig) {
+        // if the connector can be built, the connection is considered healthy
+        buildConnector(connConfig);
+    }
+
+    @Override
+    public InferSchemaResult inferSchema(BufferAllocator allocator, DatasourceConnConfig connConfig, DatasetLocationConfig locationConfig, DatasetFormatConfig formatConfig) {
+        Connector connector = buildConnector(connConfig);
+        return connector.inferSchema(allocator, locationConfig.getLocationConfig(), formatConfig);
+    }
+
+    @Override
+    public ArrowReader generateArrowReader(BufferAllocator allocator, DatasetReadCommand readCommand) {
+        Connector connector = buildConnector(readCommand.getConnConfig());
+
+        // fill in parameters missing from formatConfig
+        InferSchemaResult inferSchemaResult = inferSchema(allocator, readCommand.getConnConfig(),
+                readCommand.getLocationConfig(), readCommand.getFormatConfig());
+        readCommand.setFormatConfig(inferSchemaResult.getDatasetFormatConfig());
+        // infer the schema when it is absent
+        if (readCommand.getSchema() == null) {
+            readCommand.setSchema(inferSchemaResult.getSchema());
+        }
+
+        DataReader dataReader = connector.buildReader(allocator, readCommand);
+        return dataReader.createSplitReader(1).get(0).startRead();
+    }
+
+    @Override
+    public void datasetWrite(DatasetWriteCommand writeCommand, FlightStream flightStream, WriteCallback writeCallback) {
+
+        try (Connector connector = buildConnector(writeCommand.getConnConfig())) {
+            VectorSchemaRoot batch = flightStream.getRoot();
+
+            if (writeCommand.getSchema() == null || CollectionUtils.isEmpty(writeCommand.getSchema().getFields())) {
+                writeCommand.setSchema(batch.getSchema());
+            }
+
+            try (DataWriter dataWriter = connector.buildWriter(writeCommand)) {
+                while (flightStream.next()) {
+                    dataWriter.write(batch);
+                    // invoke the write callback
+                    writeCallback.ack(batch);
+                    log.info("[datasetWrite] data batch written successfully");
+                }
+                dataWriter.flush();
+                log.info("[datasetWrite] dataset write over");
over"); + } + } catch (DataproxyException e) { + log.error("[datasetWrite] dataset write error, cmd: {}", JsonUtils.toJSONString(writeCommand), e); + throw e; + } catch (Exception e) { + log.error("[datasetWrite] dataset write unknown exception, cmd: {}", JsonUtils.toJSONString(writeCommand), e); + throw DataproxyException.of(DataproxyErrorCode.DATASET_WRITE_ERROR, e); + } + } + +} \ No newline at end of file diff --git a/dataproxy-service/src/main/java/org/secretflow/dataproxy/service/impl/TicketServiceImpl.java b/dataproxy-service/src/main/java/org/secretflow/dataproxy/service/impl/TicketServiceImpl.java new file mode 100644 index 0000000..8c27b1d --- /dev/null +++ b/dataproxy-service/src/main/java/org/secretflow/dataproxy/service/impl/TicketServiceImpl.java @@ -0,0 +1,92 @@ +/* + * Copyright 2023 Ant Group Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.secretflow.dataproxy.service.impl; + +import org.secretflow.dataproxy.common.exceptions.DataproxyErrorCode; +import org.secretflow.dataproxy.common.exceptions.DataproxyException; +import org.secretflow.dataproxy.common.model.command.Command; +import org.secretflow.dataproxy.common.utils.IdUtils; +import org.secretflow.dataproxy.service.TicketService; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import lombok.extern.slf4j.Slf4j; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Service; + +import javax.annotation.PostConstruct; +import java.nio.charset.StandardCharsets; +import java.util.concurrent.TimeUnit; + +/** + * ticket服务实现类 + * + * @author muhong + * @date 2023-08-31 11:50 + */ +@Slf4j +@Service +public class TicketServiceImpl implements TicketService { + + /** + * 超时时间 + */ + @Value("${dataproxy.ticket.timeout}") + private int timeout = 300; + + /** + * 是否一次性使用 + */ + @Value("${dataproxy.ticket.onlyOnce}") + private boolean onlyOnce; + + private Cache ticketCache; + + @PostConstruct + private void init() { + // ticket暂时采用本地缓存方式实现 + ticketCache = Caffeine.newBuilder() + .initialCapacity(5) + .maximumSize(10) + // 过期时间为5分钟 + .expireAfterWrite(timeout, TimeUnit.SECONDS) + .build(); + } + + @Override + public byte[] generateTicket(Command command) { + String ticket = IdUtils.randomUUID(); + ticketCache.put(ticket, command); + return ticket.getBytes(StandardCharsets.UTF_8); + } + + @Override + public synchronized Command getCommandByTicket(byte[] ticket) { + String ticketStr = new String(ticket); + + Command command = ticketCache.getIfPresent(ticketStr); + if (command == null) { + throw DataproxyException.of(DataproxyErrorCode.TICKET_UNAVAILABLE); + } + + if (onlyOnce) { + // ticket只允许被消费一次 + ticketCache.invalidate(ticketStr); + } + return command; + } +} diff --git a/dataproxy_sdk/BUILD.bazel b/dataproxy_sdk/BUILD.bazel new file mode 100644 index 0000000..4814ca6 --- /dev/null +++ b/dataproxy_sdk/BUILD.bazel @@ -0,0 +1,24 @@ +# Copyright 2024 Ant Group Co., Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("//dataproxy_sdk/bazel:defs.bzl", "dataproxy_cc_library") + +package(default_visibility = ["//visibility:public"]) + +dataproxy_cc_library( + name = "sdk", + deps = [ + "//dataproxy_sdk/cc:dataproxy_sdk_cc", + ], +) diff --git a/dataproxy_sdk/bazel/BUILD.bazel b/dataproxy_sdk/bazel/BUILD.bazel new file mode 100644 index 0000000..d387d0b --- /dev/null +++ b/dataproxy_sdk/bazel/BUILD.bazel @@ -0,0 +1,30 @@ +# Copyright 2024 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +package(default_visibility = ["//visibility:public"]) + +config_setting( + name = "dataproxy_build_as_release", + values = {"compilation_mode": "opt"}, +) + +config_setting( + name = "dataproxy_build_as_debug", + values = {"compilation_mode": "dbg"}, +) + +config_setting( + name = "dataproxy_build_as_fast", + values = {"compilation_mode": "fastbuild"}, +) diff --git a/dataproxy_sdk/bazel/arrow.BUILD b/dataproxy_sdk/bazel/arrow.BUILD new file mode 100644 index 0000000..ab6c7b3 --- /dev/null +++ b/dataproxy_sdk/bazel/arrow.BUILD @@ -0,0 +1,238 @@ +# Copyright 2024 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
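+
+# Note: the genrules below stand in for CMake's configure_file() step. They
+# rewrite config.h.cmake and parquet_version.h.in with sed so that Arrow can
+# be built under Bazel without invoking CMake.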
+ +load("@rules_proto_grpc//cpp:defs.bzl", "cpp_grpc_compile") + +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE.txt"]) + +genrule( + name = "arrow_util_config", + srcs = ["cpp/src/arrow/util/config.h.cmake"], + outs = ["cpp/src/arrow/util/config.h"], + cmd = ("sed " + + "-e 's/@ARROW_VERSION_MAJOR@/9/g' " + + "-e 's/@ARROW_VERSION_MINOR@/0/g' " + + "-e 's/@ARROW_VERSION_PATCH@/0/g' " + + "-e 's/cmakedefine ARROW_USE_NATIVE_INT128/undef ARROW_USE_NATIVE_INT128/g' " + + "-e 's/cmakedefine ARROW_WITH_OPENTELEMETRY/undef ARROW_WITH_OPENTELEMETRY/g' " + + "-e 's/cmakedefine ARROW_GCS/undef ARROW_GCS/g' " + + "-e 's/cmakedefine ARROW_S3/undef ARROW_S3/g' " + + "-e 's/cmakedefine ARROW_JEMALLOC/undef ARROW_JEMALLOC/g' " + + "-e 's/cmakedefine ARROW_JEMALLOC_VENDORED/undef ARROW_JEMALLOC_VENDORED/g' " + + "-e 's/cmakedefine/define/g' " + + "$< >$@"), +) + +genrule( + name = "parquet_version_h", + srcs = ["cpp/src/parquet/parquet_version.h.in"], + outs = ["cpp/src/parquet/parquet_version.h"], + cmd = ("sed " + + "-e 's/@PARQUET_VERSION_MAJOR@/1/g' " + + "-e 's/@PARQUET_VERSION_MINOR@/5/g' " + + "-e 's/@PARQUET_VERSION_PATCH@/1/g' " + + "$< >$@"), +) + +cc_library( + name = "arrow_vendored", + srcs = glob([ + "cpp/src/arrow/vendored/datetime/*.h", + "cpp/src/arrow/vendored/datetime/*.cpp", + "cpp/src/arrow/vendored/pcg/pcg_uint128.hpp", + "cpp/src/arrow/vendored/pcg/pcg_random.hpp", + "cpp/src/arrow/vendored/pcg/pcg_extras.hpp", + "cpp/src/arrow/vendored/uriparser/*.h", + "cpp/src/arrow/vendored/uriparser/*.c", + ]), + includes = [ + "cpp/src", + ], + visibility = ["//visibility:private"], +) + +cc_library( + name = "arrow", + srcs = glob( + [ + "cpp/src/arrow/*.cc", + "cpp/src/arrow/c/*.cc", + "cpp/src/arrow/array/*.cc", + "cpp/src/arrow/csv/*.cc", + "cpp/src/arrow/io/*.cc", + "cpp/src/arrow/extension/**/*.cc", + "cpp/src/arrow/json/**/*.cc", + "cpp/src/arrow/ipc/*.cc", + "cpp/src/arrow/json/*.cc", + "cpp/src/arrow/tensor/*.cc", + "cpp/src/arrow/compute/**/*.cc", + "cpp/src/arrow/util/*.cc", + "cpp/src/arrow/adapters/orc/*.cc", + "cpp/src/arrow/vendored/optional.hpp", + "cpp/src/arrow/vendored/string_view.hpp", + "cpp/src/arrow/vendored/variant.hpp", + "cpp/src/arrow/vendored/base64.cpp", + "cpp/src/arrow/vendored/double-conversion/*.cc", + "cpp/src/arrow/vendored/double-conversion/*.h", + "cpp/src/arrow/**/*.h", + "cpp/src/parquet/**/*.h", + "cpp/src/parquet/**/*.cc", + "cpp/src/generated/*.h", + "cpp/src/generated/*.cpp", + "cpp/thirdparty/flatbuffers/include/flatbuffers/*.h", + ], + exclude = [ + "cpp/src/**/*_benchmark.cc", + "cpp/src/**/*_main.cc", + "cpp/src/**/*_nossl.cc", + "cpp/src/**/*_test.cc", + "cpp/src/**/test_*.h", + "cpp/src/**/test_*.cc", + "cpp/src/**/benchmark_util.h", + "cpp/src/**/benchmark_util.cc", + "cpp/src/**/*hdfs*.cc", + "cpp/src/**/*fuzz*.cc", + "cpp/src/arrow/memory_pool_jemalloc.cc", + "cpp/src/**/file_to_stream.cc", + "cpp/src/**/stream_to_file.cc", + "cpp/src/arrow/dataset/file_orc*", + "cpp/src/arrow/filesystem/gcsfs*.cc", + "cpp/src/arrow/filesystem/s3*.cc", + "cpp/src/arrow/filesystem/*_test_util.cc", + "cpp/src/arrow/util/bpacking_avx2.cc", + "cpp/src/arrow/util/bpacking_avx512.cc", + "cpp/src/arrow/util/bpacking_neon.cc", + "cpp/src/arrow/util/tracing_internal.cc", + "cpp/src/arrow/compute/**/*_avx2.cc", + ], + ), + hdrs = [ + # declare header from above genrule + "cpp/src/arrow/util/config.h", + "cpp/src/parquet/parquet_version.h", + "cpp/src/arrow/dataset/file_orc.h", + ], + copts = [], + 
defines = [ + "ARROW_WITH_BROTLI", + "ARROW_WITH_SNAPPY", + "ARROW_WITH_LZ4", + "ARROW_WITH_ZLIB", + "ARROW_WITH_ZSTD", + "ARROW_WITH_BZ2", + "ARROW_STATIC", + "ARROW_EXPORT=", + "PARQUET_STATIC", + "PARQUET_EXPORT=", + "WIN32_LEAN_AND_MEAN", + ], + includes = [ + "cpp/src", + "cpp/src/arrow/vendored/xxhash", + "cpp/thirdparty/flatbuffers/include", + ], + textual_hdrs = [ + "cpp/src/arrow/vendored/xxhash/xxhash.c", + ], + deps = [ + ":arrow_vendored", + "@boost//:multiprecision", + "@brotli", + "@bzip2", + "@com_github_facebook_zstd//:zstd", + "@com_github_google_snappy//:snappy", + "@com_github_lz4_lz4//:lz4", + # use openssl instead of boringssl + # "@boringssl//:crypto", + "@com_github_openssl_openssl//:openssl", + "@com_github_tencent_rapidjson//:rapidjson", + "@com_github_xtensor_xsimd//:xsimd", + "@com_google_double_conversion//:double-conversion", + "@org_apache_thrift//:thrift", + "@org_apache_orc//:orc", + "@zlib", + ], +) + +proto_library( + name = "flight_proto", + srcs = ["format/Flight.proto"], + strip_import_prefix = "format", + deps = ["@com_google_protobuf//:timestamp_proto"], +) + +cpp_grpc_compile( + name = "flight_grpc_proto_cc", + prefix_path = "../cpp/src/arrow", + protos = [":flight_proto"], +) + +filegroup( + name = "flight_grpc_proto_cc_files", + srcs = [ + ":flight_grpc_proto_cc", + ], +) + +genrule( + name = "flight_grpc_proto_cc_files_copy", + srcs = [":flight_grpc_proto_cc_files"], + outs = [ + "arrow/flight/Flight.grpc.pb.cc", + "arrow/flight/Flight.grpc.pb.h", + "arrow/flight/Flight.pb.cc", + "arrow/flight/Flight.pb.h", + ], + cmd = "cp $(locations :flight_grpc_proto_cc_files) $(@D)/arrow/flight", +) + +cc_library( + name = "arrow_flight", + srcs = glob( + [ + "cpp/src/arrow/flight/**/*.h", + "cpp/src/arrow/flight/**/*.cc", + ], + exclude = [ + "cpp/src/arrow/flight/**/test_*", + "cpp/src/arrow/flight/sql/**/*", + "cpp/src/arrow/flight/integration_tests/**/*", + "cpp/src/arrow/flight/**/*_test.cc", + "cpp/src/arrow/flight/transport/ucx/*", + "cpp/src/arrow/flight/**/*_benchmark.cc", + "cpp/src/arrow/flight/perf_server.cc", + ], + ) + [ + "arrow/flight/Flight.grpc.pb.cc", + "arrow/flight/Flight.pb.cc", + ], + hdrs = [ + "arrow/flight/Flight.grpc.pb.cc", + "arrow/flight/Flight.grpc.pb.h", + "arrow/flight/Flight.pb.cc", + "arrow/flight/Flight.pb.h", + ], + includes = [ + "cpp/src", + ], + deps = [ + ":arrow", + "@com_github_grpc_grpc//:grpc", + "@com_github_grpc_grpc//:grpc++", + ], +) diff --git a/dataproxy_sdk/bazel/brotli.BUILD b/dataproxy_sdk/bazel/brotli.BUILD new file mode 100644 index 0000000..3c5d0b1 --- /dev/null +++ b/dataproxy_sdk/bazel/brotli.BUILD @@ -0,0 +1,38 @@ +# Copyright 2024 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+licenses(["notice"])  # MIT license
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "brotli",
+    srcs = glob([
+        "c/common/*.c",
+        "c/common/*.h",
+        "c/dec/*.c",
+        "c/dec/*.h",
+        "c/enc/*.c",
+        "c/enc/*.h",
+        "c/include/brotli/*.h",
+    ]),
+    hdrs = [],
+    defines = [],
+    includes = [
+        "c/dec",
+        "c/include",
+    ],
+    linkopts = [],
+    visibility = ["//visibility:public"],
+)
diff --git a/dataproxy_sdk/bazel/bzip2.BUILD b/dataproxy_sdk/bazel/bzip2.BUILD
new file mode 100644
index 0000000..158a332
--- /dev/null
+++ b/dataproxy_sdk/bazel/bzip2.BUILD
@@ -0,0 +1,39 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# copied from https://github.com/tensorflow/io/blob/v0.25.0/third_party/bzip2.BUILD
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # BSD-like license
+
+cc_library(
+    name = "bzip2",
+    srcs = [
+        "blocksort.c",
+        "bzlib.c",
+        "bzlib_private.h",
+        "compress.c",
+        "crctable.c",
+        "decompress.c",
+        "huffman.c",
+        "randtable.c",
+    ],
+    hdrs = [
+        "bzlib.h",
+    ],
+    copts = [
+    ],
+    includes = ["."],
+)
diff --git a/dataproxy_sdk/bazel/defs.bzl b/dataproxy_sdk/bazel/defs.bzl
new file mode 100644
index 0000000..7aac85c
--- /dev/null
+++ b/dataproxy_sdk/bazel/defs.bzl
@@ -0,0 +1,92 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Wrappers around the standard cc_* rules that inject common build flags.
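+
+A minimal usage sketch (illustrative target and file names):
+
+    load("@dataproxy//dataproxy_sdk/bazel:defs.bzl", "dataproxy_cc_library", "dataproxy_cc_test")
+
+    dataproxy_cc_library(
+        name = "foo",
+        srcs = ["foo.cc"],
+        hdrs = ["foo.h"],
+    )
+
+    dataproxy_cc_test(
+        name = "foo_test",
+        srcs = ["foo_test.cc"],
+        deps = [":foo"],
+    )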
+""" + +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") +load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake") + +WARNING_FLAGS = [ + "-Wall", + "-Wextra", + "-Werror", + "-Wno-unused-parameter", + "-Wnon-virtual-dtor", +] + select({ + "@bazel_tools//src/conditions:darwin": ["-Wunused-const-variable"], + "//conditions:default": ["-Wunused-const-variable=1"], +}) +DEBUG_FLAGS = ["-O0", "-g"] +RELEASE_FLAGS = ["-O2"] +FAST_FLAGS = ["-O1"] + +def _dataproxy_copts(): + return select({ + "@dataproxy//dataproxy_sdk/bazel:dataproxy_build_as_release": RELEASE_FLAGS, + "@dataproxy//dataproxy_sdk/bazel:dataproxy_build_as_debug": DEBUG_FLAGS, + "@dataproxy//dataproxy_sdk/bazel:dataproxy_build_as_fast": FAST_FLAGS, + "//conditions:default": FAST_FLAGS, + }) + WARNING_FLAGS + +def dataproxy_cmake_external(**attrs): + if "generate_args" not in attrs: + attrs["generate_args"] = ["-GNinja"] + return cmake(**attrs) + +def dataproxy_cc_binary( + linkopts = [], + copts = [], + deps = [], + **kargs): + cc_binary( + linkopts = linkopts, + copts = copts + _dataproxy_copts(), + deps = deps, + **kargs + ) + +def dataproxy_cc_library( + linkopts = [], + copts = [], + deps = [], + **kargs): + cc_library( + linkopts = linkopts, + copts = _dataproxy_copts() + copts, + deps = deps + [ + "@com_github_gabime_spdlog//:spdlog", + ], + **kargs + ) + +def dataproxy_cc_test( + linkopts = [], + copts = [], + deps = [], + linkstatic = True, + **kwargs): + cc_test( + # -lm for tcmalloc + linkopts = linkopts + ["-lm"], + copts = _dataproxy_copts() + copts, + deps = deps + [ + # use tcmalloc same as release bins. make them has same behavior on mem. + "@com_google_googletest//:gtest_main", + ], + # static link for tcmalloc + linkstatic = True, + **kwargs + ) diff --git a/dataproxy_sdk/bazel/double_conversion.BUILD b/dataproxy_sdk/bazel/double_conversion.BUILD new file mode 100644 index 0000000..9bdff69 --- /dev/null +++ b/dataproxy_sdk/bazel/double_conversion.BUILD @@ -0,0 +1,31 @@ +# Copyright 2024 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake") + +package(default_visibility = ["//visibility:public"]) + +filegroup( + name = "all_srcs", + srcs = glob(["**"]), +) + +cmake( + name = "double-conversion", + cache_entries = { + "CMAKE_INSTALL_LIBDIR": "lib", + }, + lib_source = ":all_srcs", + out_static_libs = ["libdouble-conversion.a"], +) diff --git a/dataproxy_sdk/bazel/lz4.BUILD b/dataproxy_sdk/bazel/lz4.BUILD new file mode 100644 index 0000000..3a463e5 --- /dev/null +++ b/dataproxy_sdk/bazel/lz4.BUILD @@ -0,0 +1,37 @@ +# Copyright 2024 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("@dataproxy//dataproxy_sdk/bazel:defs.bzl", "dataproxy_cmake_external") + +package(default_visibility = ["//visibility:public"]) + +filegroup( + name = "all_srcs", + srcs = glob(["**"]), +) + +dataproxy_cmake_external( + name = "lz4", + cache_entries = { + "LZ4_BUILD_CLI": "OFF", + "BUILD_SHARED_LIBS": "OFF", + "BUILD_STATIC_LIBS": "ON", + "CMAKE_INSTALL_LIBDIR": "lib", + }, + lib_source = ":all_srcs", + out_static_libs = [ + "liblz4.a", + ], + working_directory = "build/cmake", +) diff --git a/dataproxy_sdk/bazel/orc.BUILD b/dataproxy_sdk/bazel/orc.BUILD new file mode 100644 index 0000000..3fb6618 --- /dev/null +++ b/dataproxy_sdk/bazel/orc.BUILD @@ -0,0 +1,121 @@ +# Copyright 2024 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("@rules_proto_grpc//cpp:defs.bzl", "cpp_grpc_compile") + +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # Apache 2.0 + +exports_files(["LICENSE.txt"]) + +genrule( + name = "Adaptor_h", + srcs = ["c++/src/Adaptor.hh.in"], + outs = ["c++/src/Adaptor.hh"], + cmd = ("sed " + + "-e 's/cmakedefine HAS_PREAD/define HAS_PREAD/g' " + + "-e 's/cmakedefine HAS_STRPTIME/define HAS_STRPTIME/g' " + + "-e 's/cmakedefine HAS_DIAGNOSTIC_PUSH/define HAS_DIAGNOSTIC_PUSH/g' " + + "-e 's/cmakedefine HAS_DOUBLE_TO_STRING/define HAS_DOUBLE_TO_STRING/g' " + + "-e 's/cmakedefine HAS_INT64_TO_STRING/define HAS_INT64_TO_STRING/g' " + + "-e 's/cmakedefine HAS_PRE_1970/define HAS_PRE_1970/g' " + + "-e 's/cmakedefine HAS_POST_2038/define HAS_POST_2038/g' " + + "-e 's/cmakedefine HAS_STD_ISNAN/define HAS_STD_ISNAN/g' " + + "-e 's/cmakedefine HAS_BUILTIN_OVERFLOW_CHECK/define HAS_BUILTIN_OVERFLOW_CHECK/g' " + + "-e 's/cmakedefine NEEDS_Z_PREFIX/undef NEEDS_Z_PREFIX/g' " + + "$< >$@"), +) + +genrule( + name = "orc-config", + srcs = ["c++/include/orc/orc-config.hh.in"], + outs = ["c++/include/orc/orc-config.hh"], + cmd = ("sed " + + "-e 's/@ORC_VERSION@/1.9.0/g' " + + "-e 's/cmakedefine ORC_CXX_HAS_CSTDINT/undef ORC_CXX_HAS_CSTDINT/g' " + + "$< >$@"), +) + +proto_library( + name = "orc_proto", + srcs = ["proto/orc_proto.proto"], + strip_import_prefix = "proto", +) + +cpp_grpc_compile( + name = "orc_proto_cc", + prefix_path = "../c++", + protos = [":orc_proto"], +) + +filegroup( + name = "orc_proto_cc_files", + srcs = [ + ":orc_proto_cc", + ], +) + +genrule( + name = "orc_proto_cc_file_copy", + srcs = [":orc_proto_cc_files"], + outs = [ + "c++/src/orc_proto.pb.cc", + "c++/src/orc_proto.pb.h", + ], + cmd = "cp $(locations :orc_proto_cc_files) $(@D)/c++/src", +) + +cc_library( + name = "orc", + srcs = glob( + [ + "c++/src/*.cc", 
+ "c++/src/*.hh", + "c++/src/sargs/*.cc", + "c++/src/sargs/*.hh", + "c++/src/io/*.cc", + "c++/src/io/*.hh", + "c++/src/wrap/*.cc", + "c++/src/wrap/*.hh", + "c++/src/wrap/*.h", + ], + exclude = [ + "c++/src/OrcHdfsFile.cc", + "c++/src/BpackingAvx512.cc", + ], + ) + [ + "c++/src/Adaptor.hh", + "c++/src/orc_proto.pb.cc", + ], + hdrs = glob([ + "c++/include/orc/*.hh", + "c++/include/orc/**/*.hh", + ]) + [ + "c++/include/orc/orc-config.hh", + "c++/src/orc_proto.pb.cc", + "c++/src/orc_proto.pb.h", + ], + includes = [ + "c++/include", + "c++/src", + ], + deps = [ + "@com_github_facebook_zstd//:zstd", + "@com_github_google_snappy//:snappy", + "@com_github_lz4_lz4//:lz4", + "@com_google_protobuf//:protobuf", + "@zlib", + ], +) diff --git a/dataproxy_sdk/bazel/patches/grpc.patch b/dataproxy_sdk/bazel/patches/grpc.patch new file mode 100644 index 0000000..91f2458 --- /dev/null +++ b/dataproxy_sdk/bazel/patches/grpc.patch @@ -0,0 +1,32 @@ +diff --git a/bazel/grpc_deps.bzl b/bazel/grpc_deps.bzl +index 5e65a65df4..03bbd2361e 100644 +--- a/bazel/grpc_deps.bzl ++++ b/bazel/grpc_deps.bzl +@@ -57,12 +57,12 @@ def grpc_deps(): + + native.bind( + name = "libssl", +- actual = "@boringssl//:ssl", ++ actual = "@com_github_openssl_openssl//:openssl", + ) + + native.bind( + name = "libcrypto", +- actual = "@boringssl//:crypto", ++ actual = "@com_github_openssl_openssl//:openssl", + ) + + native.bind( +diff --git a/src/core/lib/iomgr/tcp_posix.cc b/src/core/lib/iomgr/tcp_posix.cc +index 72e1b6609e..aded52d0db 100644 +--- a/src/core/lib/iomgr/tcp_posix.cc ++++ b/src/core/lib/iomgr/tcp_posix.cc +@@ -41,6 +41,8 @@ + #include + #include + ++#include "absl/strings/str_cat.h" ++ + #include + #include + #include diff --git a/dataproxy_sdk/bazel/patches/rules_boost.patch b/dataproxy_sdk/bazel/patches/rules_boost.patch new file mode 100644 index 0000000..d74e3d5 --- /dev/null +++ b/dataproxy_sdk/bazel/patches/rules_boost.patch @@ -0,0 +1,42 @@ +diff --git a/config.lzma-linux.h b/config.lzma-linux.h +index e8b00d8..092696f 100644 +--- a/config.lzma-linux.h ++++ b/config.lzma-linux.h +@@ -56,7 +56,9 @@ + /* #undef HAVE_COMMONCRYPTO_COMMONDIGEST_H */ + + /* Define to 1 if you have the header file. */ +-#define HAVE_CPUID_H 1 ++#ifdef __x86_64__ ++ #define HAVE_CPUID_H 1 ++#endif + + /* Define if the GNU dcgettext() function is already present or preinstalled. + */ +@@ -309,7 +311,9 @@ + + /* Define to 1 if _mm_clmulepi64_si128 is usable. See configure.ac for + details. */ ++#ifdef __x86_64__ + #define HAVE_USABLE_CLMUL 1 ++#endif + + /* Define to 1 if you have the `utime' function. 
*/ + /* #undef HAVE_UTIME */ +diff --git a/boost/boost.bzl b/boost/boost.bzl +index 8277dbb..afc9569 100644 +--- a/boost/boost.bzl ++++ b/boost/boost.bzl +@@ -139,9 +139,9 @@ def boost_deps(): + http_archive, + name = "org_lzma_lzma", + build_file = "@com_github_nelhage_rules_boost//:lzma.BUILD", +- url = "https://github.com/tukaani-project/xz/releases/download/v5.4.4/xz-5.4.4.tar.gz", +- sha256 = "aae39544e254cfd27e942d35a048d592959bd7a79f9a624afb0498bb5613bdf8", +- strip_prefix = "xz-5.4.4", ++ url = "https://src.fedoraproject.org/lookaside/extras/xz/xz-5.4.6.tar.gz/sha512/b08a61d8d478d3b4675cb1ddacdbbd98dc6941a55bcdd81a28679e54e9367d3a595fa123ac97874a17da571c1b712e2a3e901c2737099a9d268616a1ba3de497/xz-5.4.6.tar.gz", ++ sha256 = "aeba3e03bf8140ddedf62a0a367158340520f6b384f75ca6045ccc6c0d43fd5c", ++ strip_prefix = "xz-5.4.6", + ) + + maybe( diff --git a/dataproxy_sdk/bazel/rapidjson.BUILD b/dataproxy_sdk/bazel/rapidjson.BUILD new file mode 100644 index 0000000..29d63a1 --- /dev/null +++ b/dataproxy_sdk/bazel/rapidjson.BUILD @@ -0,0 +1,28 @@ +# Copyright 2024 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # MIT/JSON license + +cc_library( + name = "rapidjson", + srcs = glob([ + "include/**/*.h", + ]), + copts = [], + includes = [ + "include", + ], +) diff --git a/dataproxy_sdk/bazel/repositories.bzl b/dataproxy_sdk/bazel/repositories.bzl new file mode 100644 index 0000000..425af23 --- /dev/null +++ b/dataproxy_sdk/bazel/repositories.bzl @@ -0,0 +1,278 @@ +# Copyright 2024 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
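+
+# Typical WORKSPACE usage (illustrative; the repo name is assumed to be
+# "dataproxy", matching the labels used below):
+#
+#   load("@dataproxy//dataproxy_sdk/bazel:repositories.bzl", "dataproxy_deps")
+#   dataproxy_deps()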
+ +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe") + +def dataproxy_deps(): + _bazel_platform() + _bazel_rules_pkg() + _rules_proto_grpc() + + _com_github_nelhage_rules_boost() + _com_github_facebook_zstd() + _org_sourceware_bzip2() + _com_github_google_brotli() + _com_github_lz4_lz4() + _com_github_google_snappy() + _com_google_double_conversion() + _com_github_tencent_rapidjson() + _com_github_xtensor_xsimd() + _org_apache_thrift() + _org_apache_orc() + _org_apache_arrow() + _com_github_pybind11_bazel() + _com_github_pybind11() + _com_github_grpc_grpc() + + _kuscia() + _yacl() + +def _yacl(): + maybe( + http_archive, + name = "yacl", + urls = [ + "https://github.com/secretflow/yacl/archive/refs/tags/0.4.5b2.tar.gz", + ], + strip_prefix = "yacl-0.4.5b2", + sha256 = "b3fb75d41a32b80145a3bb9d36b8c039a262191f1a2f037292c649344289b01b", + ) + +def _kuscia(): + maybe( + http_archive, + name = "kuscia", + urls = [ + "https://github.com/secretflow/kuscia/archive/refs/tags/v0.9.0b0.tar.gz", + ], + strip_prefix = "kuscia-0.9.0b0", + sha256 = "851455f4a3ba70850c8a751a78ebfbbb9fd6d78ec902d0cbf32c2c565d1c8410", + ) + +def _bazel_rules_pkg(): + http_archive( + name = "rules_pkg", + sha256 = "8f9ee2dc10c1ae514ee599a8b42ed99fa262b757058f65ad3c384289ff70c4b8", + urls = [ + "https://github.com/bazelbuild/rules_pkg/releases/download/0.9.1/rules_pkg-0.9.1.tar.gz", + ], + ) + +def _bazel_platform(): + http_archive( + name = "platforms", + urls = [ + "https://mirror.bazel.build/github.com/bazelbuild/platforms/releases/download/0.0.8/platforms-0.0.8.tar.gz", + "https://github.com/bazelbuild/platforms/releases/download/0.0.8/platforms-0.0.8.tar.gz", + ], + sha256 = "8150406605389ececb6da07cbcb509d5637a3ab9a24bc69b1101531367d89d74", + ) + +def _org_sourceware_bzip2(): + maybe( + http_archive, + name = "bzip2", + build_file = "@dataproxy//dataproxy_sdk/bazel:bzip2.BUILD", + sha256 = "ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269", + strip_prefix = "bzip2-1.0.8", + urls = [ + "https://sourceware.org/pub/bzip2/bzip2-1.0.8.tar.gz", + ], + ) + +def _com_github_lz4_lz4(): + maybe( + http_archive, + name = "com_github_lz4_lz4", + sha256 = "030644df4611007ff7dc962d981f390361e6c97a34e5cbc393ddfbe019ffe2c1", + strip_prefix = "lz4-1.9.3", + type = "tar.gz", + build_file = "@dataproxy//dataproxy_sdk/bazel:lz4.BUILD", + urls = [ + "https://codeload.github.com/lz4/lz4/tar.gz/refs/tags/v1.9.3", + ], + ) + +def _com_google_double_conversion(): + maybe( + http_archive, + name = "com_google_double_conversion", + sha256 = "04ec44461850abbf33824da84978043b22554896b552c5fd11a9c5ae4b4d296e", + strip_prefix = "double-conversion-3.3.0", + build_file = "@dataproxy//dataproxy_sdk/bazel:double_conversion.BUILD", + urls = [ + "https://github.com/google/double-conversion/archive/refs/tags/v3.3.0.tar.gz", + ], + ) + +def _com_github_xtensor_xsimd(): + maybe( + http_archive, + name = "com_github_xtensor_xsimd", + sha256 = "d52551360d37709675237d2a0418e28f70995b5b7cdad7c674626bcfbbf48328", + type = "tar.gz", + strip_prefix = "xsimd-8.1.0", + build_file = "@dataproxy//dataproxy_sdk/bazel:xsimd.BUILD", + urls = [ + "https://codeload.github.com/xtensor-stack/xsimd/tar.gz/refs/tags/8.1.0", + ], + ) + +def _com_github_nelhage_rules_boost(): + # use boost 1.83 + RULES_BOOST_COMMIT = "cfa585b1b5843993b70aa52707266dc23b3282d0" + maybe( + http_archive, + name = "com_github_nelhage_rules_boost", + sha256 = 
"a7c42df432fae9db0587ff778d84f9dc46519d67a984eff8c79ae35e45f277c1", + strip_prefix = "rules_boost-%s" % RULES_BOOST_COMMIT, + patch_args = ["-p1"], + patches = ["@dataproxy//dataproxy_sdk/bazel:patches/rules_boost.patch"], + urls = [ + "https://github.com/nelhage/rules_boost/archive/%s.tar.gz" % RULES_BOOST_COMMIT, + ], + ) + +def _com_github_facebook_zstd(): + maybe( + http_archive, + name = "com_github_facebook_zstd", + build_file = "@dataproxy//dataproxy_sdk/bazel:zstd.BUILD", + strip_prefix = "zstd-1.5.6", + sha256 = "8c29e06cf42aacc1eafc4077ae2ec6c6fcb96a626157e0593d5e82a34fd403c1", + type = ".tar.gz", + urls = [ + "https://github.com/facebook/zstd/releases/download/v1.5.6/zstd-1.5.6.tar.gz", + ], + ) + +def _com_github_google_snappy(): + maybe( + http_archive, + name = "com_github_google_snappy", + sha256 = "75c1fbb3d618dd3a0483bff0e26d0a92b495bbe5059c8b4f1c962b478b6e06e7", + strip_prefix = "snappy-1.1.9", + build_file = "@dataproxy//dataproxy_sdk/bazel:snappy.BUILD", + urls = [ + "https://github.com/google/snappy/archive/refs/tags/1.1.9.tar.gz", + ], + ) + +def _com_github_google_brotli(): + maybe( + http_archive, + name = "brotli", + build_file = "@dataproxy//dataproxy_sdk/bazel:brotli.BUILD", + sha256 = "e720a6ca29428b803f4ad165371771f5398faba397edf6778837a18599ea13ff", + strip_prefix = "brotli-1.1.0", + urls = [ + "https://github.com/google/brotli/archive/refs/tags/v1.1.0.tar.gz", + ], + ) + +def _com_github_tencent_rapidjson(): + maybe( + http_archive, + name = "com_github_tencent_rapidjson", + sha256 = "bf7ced29704a1e696fbccf2a2b4ea068e7774fa37f6d7dd4039d0787f8bed98e", + strip_prefix = "rapidjson-1.1.0", + build_file = "@dataproxy//dataproxy_sdk/bazel:rapidjson.BUILD", + urls = [ + "https://github.com/Tencent/rapidjson/archive/refs/tags/v1.1.0.tar.gz", + ], + ) + +def _org_apache_thrift(): + maybe( + http_archive, + name = "org_apache_thrift", + build_file = "@dataproxy//dataproxy_sdk/bazel:thrift.BUILD", + sha256 = "5da60088e60984f4f0801deeea628d193c33cec621e78c8a43a5d8c4055f7ad9", + strip_prefix = "thrift-0.13.0", + urls = [ + "https://github.com/apache/thrift/archive/v0.13.0.tar.gz", + ], + ) + +def _org_apache_arrow(): + maybe( + http_archive, + name = "org_apache_arrow", + sha256 = "07cdb4da6795487c800526b2865c150ab7d80b8512a31793e6a7147c8ccd270f", + strip_prefix = "arrow-apache-arrow-14.0.2", + build_file = "@dataproxy//dataproxy_sdk/bazel:arrow.BUILD", + urls = [ + "https://github.com/apache/arrow/archive/refs/tags/apache-arrow-14.0.2.tar.gz", + ], + ) + +def _org_apache_orc(): + maybe( + http_archive, + name = "org_apache_orc", + sha256 = "3037fd324a17994f55146aae342531c4a343fdc1ac698c5c6f0f5b7a75ece501", + strip_prefix = "orc-1.9.3", + build_file = "@dataproxy//dataproxy_sdk/bazel:orc.BUILD", + urls = [ + "https://github.com/apache/orc/archive/refs/tags/v1.9.3.tar.gz", + ], + ) + +def _com_github_pybind11_bazel(): + maybe( + http_archive, + name = "pybind11_bazel", + sha256 = "2d3316d89b581966fc11eab9aa9320276baee95c8233c7a8efc7158623a48de0", + strip_prefix = "pybind11_bazel-ff261d2e9190955d0830040b20ea59ab9dbe66c8", + urls = [ + "https://github.com/pybind/pybind11_bazel/archive/ff261d2e9190955d0830040b20ea59ab9dbe66c8.zip", + ], + ) + +def _com_github_pybind11(): + maybe( + http_archive, + name = "pybind11", + build_file = "@pybind11_bazel//:pybind11.BUILD", + sha256 = "d475978da0cdc2d43b73f30910786759d593a9d8ee05b1b6846d1eb16c6d2e0c", + strip_prefix = "pybind11-2.11.1", + urls = [ + "https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz", + ], + ) + 
+def _rules_proto_grpc(): + http_archive( + name = "rules_proto_grpc", + sha256 = "928e4205f701b7798ce32f3d2171c1918b363e9a600390a25c876f075f1efc0a", + strip_prefix = "rules_proto_grpc-4.4.0", + urls = [ + "https://github.com/rules-proto-grpc/rules_proto_grpc/releases/download/4.4.0/rules_proto_grpc-4.4.0.tar.gz", + ], + ) + +def _com_github_grpc_grpc(): + maybe( + http_archive, + name = "com_github_grpc_grpc", + sha256 = "7f42363711eb483a0501239fd5522467b31d8fe98d70d7867c6ca7b52440d828", + strip_prefix = "grpc-1.51.0", + type = "tar.gz", + patch_args = ["-p1"], + patches = ["@dataproxy//dataproxy_sdk/bazel:patches/grpc.patch"], + urls = [ + "https://github.com/grpc/grpc/archive/refs/tags/v1.51.0.tar.gz", + ], + ) diff --git a/dataproxy_sdk/bazel/snappy.BUILD b/dataproxy_sdk/bazel/snappy.BUILD new file mode 100644 index 0000000..16d0557 --- /dev/null +++ b/dataproxy_sdk/bazel/snappy.BUILD @@ -0,0 +1,39 @@ +# Copyright 2024 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +load("@rules_foreign_cc//foreign_cc:defs.bzl", "cmake") + +package(default_visibility = ["//visibility:public"]) + +filegroup( + name = "all_srcs", + srcs = glob(["**"]), +) + +cmake( + name = "snappy", + cache_entries = { + "SNAPPY_BUILD_TESTS": "OFF", + "SNAPPY_BUILD_BENCHMARKS": "OFF", + "CMAKE_INSTALL_LIBDIR": "lib", + }, + generate_crosstool_file = False, + install_args = [ + "--prefix $${INSTALLDIR}", + ], + lib_source = ":all_srcs", + out_static_libs = [ + "libsnappy.a", + ], +) diff --git a/dataproxy_sdk/bazel/thrift.BUILD b/dataproxy_sdk/bazel/thrift.BUILD new file mode 100644 index 0000000..13fb1d1 --- /dev/null +++ b/dataproxy_sdk/bazel/thrift.BUILD @@ -0,0 +1,71 @@ +# Copyright 2024 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"]) # Apache 2.0
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "thrift",
+    srcs = glob([
+        "lib/cpp/src/thrift/**/*.h",
+    ]) + [
+        "lib/cpp/src/thrift/protocol/TProtocol.cpp",
+        "lib/cpp/src/thrift/transport/TBufferTransports.cpp",
+        "lib/cpp/src/thrift/transport/TTransportException.cpp",
+    ],
+    hdrs = [
+        "compiler/cpp/src/thrift/version.h",
+        "lib/cpp/src/thrift/config.h",
+    ],
+    includes = [
+        "lib/cpp/src",
+    ],
+    textual_hdrs = [
+        "lib/cpp/src/thrift/protocol/TBinaryProtocol.tcc",
+        "lib/cpp/src/thrift/protocol/TCompactProtocol.tcc",
+    ],
+    deps = [
+        "@boost//:units",
+    ],
+)
+
+# Note: the version stamped below matches the thrift-0.13.0 archive fetched in
+# the repository rules (the original stamped 0.12.0, which was inconsistent).
+genrule(
+    name = "version_h",
+    srcs = [
+        "compiler/cpp/src/thrift/version.h.in",
+    ],
+    outs = [
+        "compiler/cpp/src/thrift/version.h",
+    ],
+    cmd = "sed 's/@PACKAGE_VERSION@/0.13.0/g' $< > $@",
+)
+
+genrule(
+    name = "config_h",
+    srcs = ["build/cmake/config.h.in"],
+    outs = ["lib/cpp/src/thrift/config.h"],
+    cmd = ("sed " +
+           "-e 's/cmakedefine/define/g' " +
+           "-e 's/$${PACKAGE}/thrift/g' " +
+           "-e 's/$${PACKAGE_BUGREPORT}//g' " +
+           "-e 's/$${PACKAGE_NAME}/thrift/g' " +
+           "-e 's/$${PACKAGE_TARNAME}/thrift/g' " +
+           "-e 's/$${PACKAGE_URL}//g' " +
+           "-e 's/$${PACKAGE_VERSION}/0.13.0/g' " +
+           "-e 's/$${PACKAGE_STRING}/thrift 0.13.0/g' " +
+           "$< >$@"),
+)
diff --git a/dataproxy_sdk/bazel/xsimd.BUILD b/dataproxy_sdk/bazel/xsimd.BUILD
new file mode 100644
index 0000000..139b549
--- /dev/null
+++ b/dataproxy_sdk/bazel/xsimd.BUILD
@@ -0,0 +1,45 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"]) # BSD 3-Clause
+
+exports_files(["LICENSE"])
+
+cc_library(
+    name = "xsimd",
+    srcs = [],
+    hdrs = glob(
+        [
+            "include/xsimd/*.hpp",
+            "include/xsimd/config/*.hpp",
+            "include/xsimd/math/*.hpp",
+            "include/xsimd/memory/*.hpp",
+            "include/xsimd/stl/*.hpp",
+            "include/xsimd/types/*.hpp",
+        ],
+        exclude = [
+        ],
+    ),
+    copts = [],
+    defines = [],
+    includes = [
+        "include",
+    ],
+    linkopts = [],
+    visibility = ["//visibility:public"],
+    deps = [
+    ],
+)
diff --git a/dataproxy_sdk/bazel/zstd.BUILD b/dataproxy_sdk/bazel/zstd.BUILD
new file mode 100644
index 0000000..cac11e0
--- /dev/null
+++ b/dataproxy_sdk/bazel/zstd.BUILD
@@ -0,0 +1,38 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load("@dataproxy//dataproxy_sdk/bazel:defs.bzl", "dataproxy_cmake_external")
+
+package(default_visibility = ["//visibility:public"])
+
+filegroup(
+    name = "all",
+    srcs = glob(["**"]),
+)
+
+dataproxy_cmake_external(
+    name = "zstd",
+    cache_entries = {
+        "ZSTD_BUILD_PROGRAMS": "OFF",
+        "ZSTD_BUILD_SHARED": "OFF",
+        "ZLIB_BUILD_STATIC": "ON",
+        "ZSTD_BUILD_TESTS": "OFF",
+        "ZSTD_MULTITHREAD_SUPPORT": "OFF",
+        "CMAKE_INSTALL_LIBDIR": "lib",
+    },
+    lib_source = "@com_github_facebook_zstd//:all",
+    out_include_dir = "include/",
+    out_static_libs = ["libzstd.a"],
+    working_directory = "build/cmake",
+)
diff --git a/dataproxy_sdk/cc/BUILD.bazel b/dataproxy_sdk/cc/BUILD.bazel
new file mode 100644
index 0000000..66e7111
--- /dev/null
+++ b/dataproxy_sdk/cc/BUILD.bazel
@@ -0,0 +1,104 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load("//dataproxy_sdk/bazel:defs.bzl", "dataproxy_cc_library", "dataproxy_cc_test")
+
+package(default_visibility = ["//visibility:public"])
+
+dataproxy_cc_library(
+    name = "dataproxy_sdk_cc",
+    hdrs = [
+        "api.h",
+    ],
+    deps = [
+        ":data_proxy_file",
+    ],
+)
+
+dataproxy_cc_library(
+    name = "exception",
+    hdrs = [
+        "exception.h",
+    ],
+    deps = [
+        "@yacl//yacl/base:exception",
+    ],
+)
+
+dataproxy_cc_library(
+    name = "utils",
+    srcs = ["utils.cc"],
+    hdrs = ["utils.h"],
+    deps = [
+        ":exception",
+        "@org_apache_arrow//:arrow",
+    ],
+)
+
+dataproxy_cc_library(
+    name = "proto",
+    srcs = ["data_proxy_pb.cc"],
+    hdrs = ["data_proxy_pb.h"],
+    deps = [
+        ":exception",
+        "//dataproxy_sdk/proto:data_proxy_proto_cc",
+        "@kuscia//proto/api/v1alpha1/datamesh:flightdm_cc_proto",
+    ],
+)
+
+dataproxy_cc_library(
+    name = "file_help",
+    srcs = ["file_help.cc"],
+    hdrs = ["file_help.h"],
+    deps = [
+        ":exception",
+        ":proto",
+        "@org_apache_arrow//:arrow",
+    ],
+)
+
+dataproxy_cc_test(
+    name = "file_help_test",
+    srcs = ["file_help_test.cc"],
+    deps = [
+        ":exception",
+        ":file_help",
+        ":utils",
+        "@org_apache_arrow//:arrow",
+    ],
+)
+
+dataproxy_cc_library(
+    name = "data_proxy_conn",
+    srcs = ["data_proxy_conn.cc"],
+    hdrs = ["data_proxy_conn.h"],
+    deps = [
+        ":exception",
+        "@org_apache_arrow//:arrow_flight",
+    ],
+)
+
+dataproxy_cc_library(
+    name = "data_proxy_file",
+    srcs = ["data_proxy_file.cc"],
+    hdrs = ["data_proxy_file.h"],
+    deps = [
+        ":data_proxy_conn",
+        ":exception",
+        ":file_help",
+        ":proto",
+        ":utils",
+        "@org_apache_arrow//:arrow_flight",
+    ],
+)
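For downstream consumers, the aggregate `dataproxy_sdk_cc` target above is the intended entry point. A hedged sketch of a dependent target in another package (the binary name and source file are illustrative):

    cc_binary(
        name = "dp_tool",
        srcs = ["dp_tool.cc"],
        deps = ["//dataproxy_sdk/cc:dataproxy_sdk_cc"],
    )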
diff --git a/dataproxy_sdk/cc/api.h b/dataproxy_sdk/cc/api.h
new file mode 100644
index 0000000..8de3c3b
--- /dev/null
+++ b/dataproxy_sdk/cc/api.h
@@ -0,0 +1,17 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "dataproxy_sdk/cc/data_proxy_file.h"
\ No newline at end of file
diff --git a/dataproxy_sdk/cc/data_proxy_conn.cc b/dataproxy_sdk/cc/data_proxy_conn.cc
new file mode 100644
index 0000000..a76223d
--- /dev/null
+++ b/dataproxy_sdk/cc/data_proxy_conn.cc
@@ -0,0 +1,169 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "data_proxy_conn.h"
+
+#include <sstream>
+
+#include "dataproxy_sdk/cc/exception.h"
+
+namespace dataproxy_sdk {
+
+class DataProxyConn::Impl {
+ private:
+  struct GetFlightInfoResult {
+    std::unique_ptr<arrow::flight::FlightInfo> dp_info;
+    // Connection to a separately deployed DP instance.
+    std::unique_ptr<arrow::flight::FlightClient> dp_client;
+  };
+
+ public:
+  void Connect(const std::string& host, bool use_tls,
+               const arrow::flight::FlightClientOptions& options) {
+    std::stringstream uri_string;
+    if (use_tls) {
+      uri_string << "grpc+tls://" << host;
+    } else {
+      uri_string << "grpc+tcp://" << host;
+    }
+    arrow::flight::Location location;
+    ASSIGN_ARROW_OR_THROW(location,
+                          arrow::flight::Location::Parse(uri_string.str()));
+
+    ASSIGN_ARROW_OR_THROW(
+        dm_client_, arrow::flight::FlightClient::Connect(location, options));
+  }
+
+  GetFlightInfoResult GetFlightInfo(
+      const arrow::flight::FlightDescriptor& descriptor) {
+    GetFlightInfoResult result;
+    ASSIGN_ARROW_OR_THROW(result.dp_info,
+                          dm_client_->GetFlightInfo(descriptor));
+
+    // 2. Get the DP address from the returned endpoint.
+    const arrow::flight::Location& location =
+        result.dp_info->endpoints().front().locations.front();
+    std::string dp_url = location.ToString();
+    // If DP is not deployed separately, later operations go through DM.
+    if (dp_url.find("kuscia://") == std::string::npos) {
+      ASSIGN_ARROW_OR_THROW(result.dp_client,
+                            arrow::flight::FlightClient::Connect(location));
+    }
+
+    return result;
+  }
+
+  std::unique_ptr<FlightStreamReaderWrapper> DoGet(
+      const arrow::flight::FlightDescriptor& descriptor) {
+    GetFlightInfoResult result = GetFlightInfo(descriptor);
+
+    std::unique_ptr<arrow::flight::FlightClient> dp_client =
+        std::move(result.dp_client);
+    std::unique_ptr<arrow::flight::FlightStreamReader> stream_reader;
+    if (dp_client) {
+      ASSIGN_ARROW_OR_THROW(
+          stream_reader,
+          dp_client->DoGet(result.dp_info->endpoints().front().ticket));
+    } else {
+      ASSIGN_ARROW_OR_THROW(
+          stream_reader,
+          dm_client_->DoGet(result.dp_info->endpoints().front().ticket));
+    }
+
+    return std::make_unique<FlightStreamReaderWrapper>(
+        std::move(stream_reader), std::move(dp_client));
+  }
+
+  std::unique_ptr<DoPutResultWrapper> DoPut(
+      const arrow::flight::FlightDescriptor& descriptor,
+      std::shared_ptr<arrow::Schema> schema) {
+    GetFlightInfoResult result = GetFlightInfo(descriptor);
+
+    auto dp_descriptor = arrow::flight::FlightDescriptor::Command(
+        result.dp_info->endpoints().front().ticket.ticket);
+    std::unique_ptr<arrow::flight::FlightClient> dp_client =
+        std::move(result.dp_client);
+    arrow::flight::FlightClient::DoPutResult put_result;
+    if (dp_client) {
+      ASSIGN_ARROW_OR_THROW(put_result,
+                            dp_client->DoPut(dp_descriptor, schema));
+    } else {
+      ASSIGN_ARROW_OR_THROW(put_result,
+                            dm_client_->DoPut(dp_descriptor, schema));
+    }
+
+    return std::make_unique<DoPutResultWrapper>(put_result,
+                                                std::move(dp_client));
+  }
+
+  std::unique_ptr<arrow::flight::ResultStream> DoAction(
+      const arrow::flight::Action& action) {
+    std::unique_ptr<arrow::flight::ResultStream> ret;
+    ASSIGN_ARROW_OR_THROW(ret, dm_client_->DoAction(action));
+    return ret;
+  }
+
+  void Close() { CHECK_ARROW_OR_THROW(dm_client_->Close()); }
+
+ private:
+  std::unique_ptr<arrow::flight::FlightClient> dm_client_;
+};
+
+void DoPutResultWrapper::WriteRecordBatch(const arrow::RecordBatch& batch) {
+  CHECK_ARROW_OR_THROW(stream_writer_->WriteRecordBatch(batch));
+}
+
+void DoPutResultWrapper::Close() {
+  CHECK_ARROW_OR_THROW(stream_writer_->Close());
+}
+
+std::shared_ptr<arrow::RecordBatch>
+FlightStreamReaderWrapper::ReadRecordBatch() {
+  arrow::flight::FlightStreamChunk chunk;
+  ASSIGN_ARROW_OR_THROW(chunk, stream_reader_->Next());
+  return chunk.data;
+}
+
+DataProxyConn::DataProxyConn() { impl_ = std::make_unique<Impl>(); }
+DataProxyConn::~DataProxyConn() = default;
+
+std::unique_ptr<DataProxyConn> DataProxyConn::Connect(
+    const std::string& host, bool use_tls,
+    const arrow::flight::FlightClientOptions& options) {
+  std::unique_ptr<DataProxyConn> ret = std::make_unique<DataProxyConn>();
+  ret->impl_->Connect(host, use_tls, options);
+  return ret;
+}
+
+std::unique_ptr<DoPutResultWrapper> DataProxyConn::DoPut(
+    const arrow::flight::FlightDescriptor& descriptor,
+    std::shared_ptr<arrow::Schema> schema) {
+  return impl_->DoPut(descriptor, schema);
+}
+
+std::unique_ptr<FlightStreamReaderWrapper> DataProxyConn::DoGet(
+    const arrow::flight::FlightDescriptor& descriptor) {
+  return impl_->DoGet(descriptor);
+}
+
+std::unique_ptr<arrow::flight::ResultStream> DataProxyConn::DoAction(
+    const arrow::flight::Action& action) {
+  return impl_->DoAction(action);
+}
+
+void DataProxyConn::Close() { impl_->Close(); }
+
+}  // namespace dataproxy_sdk
\ No newline at end of file
diff --git a/dataproxy_sdk/cc/data_proxy_conn.h b/dataproxy_sdk/cc/data_proxy_conn.h
new file mode 100644
index 0000000..606fbcd
--- /dev/null
+++ b/dataproxy_sdk/cc/data_proxy_conn.h
@@ -0,0 +1,95 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/flight/client.h"
+#include "arrow/flight/types.h"
+#include "arrow/type.h"
+
+namespace dataproxy_sdk {
+
+class DoPutResultWrapper {
+ public:
+  void WriteRecordBatch(const arrow::RecordBatch& batch);
+  void Close();
+
+ public:
+  DoPutResultWrapper(arrow::flight::FlightClient::DoPutResult& result,
+                     std::unique_ptr<arrow::flight::FlightClient> client)
+      : stream_writer_(std::move(result.writer)),
+        metadata_reader_(std::move(result.reader)),
+        dp_client_(std::move(client)) {}
+  ~DoPutResultWrapper() = default;
+
+ private:
+  // a writer to write record batches to
+  std::unique_ptr<arrow::flight::FlightStreamWriter> stream_writer_;
+  // a reader for application metadata from the server
+  std::unique_ptr<arrow::flight::FlightMetadataReader> metadata_reader_;
+  // When DP is deployed separately, this client owns the connection and keeps
+  // it alive; not for public use.
+  std::unique_ptr<arrow::flight::FlightClient> dp_client_;
+};
+
+class FlightStreamReaderWrapper {
+ public:
+  std::shared_ptr<arrow::RecordBatch> ReadRecordBatch();
+
+ public:
+  FlightStreamReaderWrapper(
+      std::unique_ptr<arrow::flight::FlightStreamReader> stream,
+      std::unique_ptr<arrow::flight::FlightClient> client)
+      : stream_reader_(std::move(stream)), dp_client_(std::move(client)) {}
+  ~FlightStreamReaderWrapper() = default;
+
+ private:
+  std::unique_ptr<arrow::flight::FlightStreamReader> stream_reader_;
+  // When DP is deployed separately, this client owns the connection and keeps
+  // it alive; not for public use.
+  std::unique_ptr<arrow::flight::FlightClient> dp_client_;
+};
+
+class DataProxyConn {
+ public:
+  static std::unique_ptr<DataProxyConn> Connect(
+      const std::string& host, bool use_tls,
+      const arrow::flight::FlightClientOptions& options);
+
+ public:
+  DataProxyConn();
+  ~DataProxyConn();
+
+ public:
+  std::unique_ptr<DoPutResultWrapper> DoPut(
+      const arrow::flight::FlightDescriptor& descriptor,
+      std::shared_ptr<arrow::Schema> schema);
+
+  std::unique_ptr<FlightStreamReaderWrapper> DoGet(
+      const arrow::flight::FlightDescriptor& descriptor);
+
+  std::unique_ptr<arrow::flight::ResultStream> DoAction(
+      const arrow::flight::Action& action);
+
+  void Close();
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace dataproxy_sdk
\ No newline at end of file
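For orientation, a minimal caller-side sketch of the connection API above. It is illustrative only: the endpoint address and command bytes are placeholders, and in the SDK this plumbing is normally driven by DataProxyFile (defined below) rather than called directly.

    #include "dataproxy_sdk/cc/data_proxy_conn.h"

    int main() {
      auto options = arrow::flight::FlightClientOptions::Defaults();
      // Hypothetical DataMesh endpoint; TLS disabled for brevity.
      auto conn = dataproxy_sdk::DataProxyConn::Connect(
          "127.0.0.1:8071", /*use_tls=*/false, options);
      // In real use the command payload is a serialized DataMesh message.
      auto descriptor = arrow::flight::FlightDescriptor::Command("<any-bytes>");
      auto reader = conn->DoGet(descriptor);
      while (auto batch = reader->ReadRecordBatch()) {
        // Consume each record batch until the stream is exhausted.
      }
      conn->Close();
      return 0;
    }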
diff --git a/dataproxy_sdk/cc/data_proxy_file.cc b/dataproxy_sdk/cc/data_proxy_file.cc
new file mode 100644
index 0000000..b4270d3
--- /dev/null
+++ b/dataproxy_sdk/cc/data_proxy_file.cc
@@ -0,0 +1,188 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dataproxy_sdk/cc/data_proxy_file.h"
+
+#include <memory>
+#include <string>
+
+#include "arrow/buffer.h"
+#include "arrow/flight/api.h"
+#include "dataproxy_sdk/cc/data_proxy_conn.h"
+#include "dataproxy_sdk/cc/exception.h"
+#include "dataproxy_sdk/cc/file_help.h"
+#include "dataproxy_sdk/cc/utils.h"
+#include "spdlog/spdlog.h"
+
+namespace dataproxy_sdk {
+
+class DataProxyFile::Impl {
+ public:
+  void Init(const proto::DataProxyConfig &config) {
+    arrow::flight::FlightClientOptions options =
+        arrow::flight::FlightClientOptions::Defaults();
+    if (config.has_tls_config()) {
+      options.private_key =
+          ReadFileContent(config.tls_config().private_key_path());
+      options.cert_chain =
+          ReadFileContent(config.tls_config().certificate_path());
+      options.tls_root_certs =
+          ReadFileContent(config.tls_config().ca_file_path());
+    }
+
+    dp_conn_ = DataProxyConn::Connect(config.data_proxy_addr(),
+                                      config.has_tls_config(), options);
+  }
+
+  void DownloadFile(const proto::DownloadInfo &info,
+                    const std::string &file_path,
+                    proto::FileFormat file_format) {
+    // 1. Get the DP info from DM.
+    auto any = BuildDownloadAny(info, file_format);
+
+    // 2. Connect to DP.
+    auto descriptor =
+        arrow::flight::FlightDescriptor::Command(any.SerializeAsString());
+    auto stream_reader = dp_conn_->DoGet(descriptor);
+
+    // 3. Download the data from the read stream into the local file.
+    std::unique_ptr<FileHelpWrite> file_write =
+        FileHelpWrite::Make(file_format, file_path);
+    while (true) {
+      auto record_batch = stream_reader->ReadRecordBatch();
+      if (record_batch == nullptr) {
+        // read finished
+        break;
+      }
+      file_write->DoWrite(record_batch);
+    }
+
+    file_write->DoClose();
+  }
+
+  FileHelpRead::Options BuildReadOptions(const proto::UploadInfo &info) {
+    FileHelpRead::Options options = FileHelpRead::Options::Defaults();
+    for (auto &column : info.columns()) {
+      options.column_types.emplace(column.name(), GetDataType(column.type()));
+    }
+    return options;
+  }
+
+  void DoUpload(const proto::UploadInfo &info, const std::string &file_path,
+                proto::FileFormat file_format) {
+    // 1. Connect to DP using the DP info returned by DM.
+    auto any = BuildUploadAny(info, file_format);
+    auto descriptor =
+        arrow::flight::FlightDescriptor::Command(any.SerializeAsString());
+
+    // 2. Open a read stream over the local file.
+    auto read_options = BuildReadOptions(info);
+    std::unique_ptr<FileHelpRead> file_read =
+        FileHelpRead::Make(file_format, file_path, read_options);
+
+    auto put_result = dp_conn_->DoPut(descriptor, file_read->Schema());
+
+    // 3. Write the file data into the put stream.
+    while (true) {
+      std::shared_ptr<arrow::RecordBatch> batch;
+      file_read->DoRead(&batch);
+      if (batch.get() == nullptr) {
+        break;
+      }
+      put_result->WriteRecordBatch(*batch);
+    }
+
+    put_result->Close();
+    file_read->DoClose();
+  }
+
+  void CreateDomainData(proto::UploadInfo &info,
+                        proto::FileFormat file_format) {
+    auto action_msg = BuildActionCreateDomainDataRequest(info, file_format);
+    arrow::flight::Action action{
+        "ActionCreateDomainDataRequest",
+        arrow::Buffer::FromString(action_msg.SerializeAsString())};
+    auto result_stream = dp_conn_->DoAction(action);
+
+    std::unique_ptr<arrow::flight::Result> result;
+    ASSIGN_ARROW_OR_THROW(result, result_stream->Next());
+
+    auto response = GetActionCreateDomainDataResponse(result->body->ToString());
+    CHECK_RESP_OR_THROW(response);
+    if (info.domaindata_id().empty()) {
+      info.set_domaindata_id(response.data().domaindata_id());
+    } else if (response.data().domaindata_id() != info.domaindata_id()) {
+      DATAPROXY_THROW("domaindata id error, request:{}, response:{}",
+                      info.domaindata_id(), response.data().domaindata_id());
+    }
+  }
+
+  void DeleteDomainData(const proto::UploadInfo &info) {
+    auto action_request = BuildActionDeleteDomainDataRequest(info);
+    arrow::flight::Action action{
+        "ActionDeleteDomainDataRequest",
+        arrow::Buffer::FromString(action_request.SerializeAsString())};
+    auto result = dp_conn_->DoAction(action);
+  }
+
+  void UploadFile(proto::UploadInfo &info, const std::string &file_path,
+                  proto::FileFormat file_format) {
+    dataproxy_sdk::CheckUploadInfo(info);
+    CreateDomainData(info, file_format);
+    try {
+      DoUpload(info, file_path, file_format);
+    } catch (...) {
+      // Roll back the domaindata record if the upload itself failed.
+      try {
+        DeleteDomainData(info);
+      } catch (const std::exception &e) {
+        SPDLOG_WARN("DeleteDomainData error. msg:{}", e.what());
+      }
+      throw;
+    }
+  }
+
+  void Close() { dp_conn_->Close(); }
+
+ private:
+  std::unique_ptr<DataProxyConn> dp_conn_;
+};
+
+std::unique_ptr<DataProxyFile> DataProxyFile::Make(
+    const proto::DataProxyConfig &config) {
+  std::unique_ptr<DataProxyFile> ret = std::make_unique<DataProxyFile>();
+  ret->impl_->Init(config);
+  return ret;
+}
+
+DataProxyFile::DataProxyFile() { impl_ = std::make_unique<Impl>(); }
+
+DataProxyFile::~DataProxyFile() = default;
+
+void DataProxyFile::DownloadFile(const proto::DownloadInfo &info,
+                                 const std::string &file_path,
+                                 proto::FileFormat file_format) {
+  impl_->DownloadFile(info, file_path, file_format);
+}
+
+void DataProxyFile::UploadFile(proto::UploadInfo &info,
+                               const std::string &file_path,
+                               proto::FileFormat file_format) {
+  impl_->UploadFile(info, file_path, file_format);
+}
+
+void DataProxyFile::Close() { impl_->Close(); }
+
+}  // namespace dataproxy_sdk
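A minimal end-to-end sketch of the file API implemented above (its declaration follows in data_proxy_file.h). The address, domaindata id, and output path are placeholders, and error handling is elided:

    #include "dataproxy_sdk/cc/api.h"

    int main() {
      dataproxy_sdk::proto::DataProxyConfig config;
      config.set_data_proxy_addr("127.0.0.1:8071");  // hypothetical endpoint
      auto dp_file = dataproxy_sdk::DataProxyFile::Make(config);

      dataproxy_sdk::proto::DownloadInfo info;
      info.set_domaindata_id("my-domaindata-id");  // placeholder id
      dp_file->DownloadFile(info, "/tmp/out.csv",
                            dataproxy_sdk::proto::FileFormat::CSV);
      dp_file->Close();
      return 0;
    }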
diff --git a/dataproxy_sdk/cc/data_proxy_file.h b/dataproxy_sdk/cc/data_proxy_file.h
new file mode 100644
index 0000000..7b1f02b
--- /dev/null
+++ b/dataproxy_sdk/cc/data_proxy_file.h
@@ -0,0 +1,48 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "dataproxy_sdk/cc/data_proxy_pb.h"
+
+namespace dataproxy_sdk {
+
+class DataProxyFile {
+ public:
+  static std::unique_ptr<DataProxyFile> Make(
+      const proto::DataProxyConfig& config);
+
+ public:
+  DataProxyFile();
+  ~DataProxyFile();
+
+ public:
+  void DownloadFile(const proto::DownloadInfo& info,
+                    const std::string& file_path,
+                    proto::FileFormat file_format);
+
+  void UploadFile(proto::UploadInfo& info, const std::string& file_path,
+                  proto::FileFormat file_format);
+
+  void Close();
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace dataproxy_sdk
\ No newline at end of file
diff --git a/dataproxy_sdk/cc/data_proxy_pb.cc b/dataproxy_sdk/cc/data_proxy_pb.cc
new file mode 100644
index 0000000..48bfe0b
--- /dev/null
+++ b/dataproxy_sdk/cc/data_proxy_pb.cc
@@ -0,0 +1,117 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dataproxy_sdk/cc/data_proxy_pb.h"
+
+#include "dataproxy_sdk/cc/exception.h"
+
+namespace dataproxy_sdk {
+
+inline proto::ContentType FormatToContentType(proto::FileFormat format) {
+  switch (format) {
+    case proto::FileFormat::BINARY:
+      return proto::ContentType::RAW;
+    case proto::FileFormat::CSV:
+    case proto::FileFormat::ORC:
+      return proto::ContentType::CSV;
+    default:
+      DATAPROXY_THROW("unsupported file format:{}",
+                      proto::FileFormat_Name(format));
+  }
+}
+
+inline kuscia_proto::FileFormat ChangeToKusciaFileFormat(
+    proto::FileFormat format) {
+  switch (format) {
+    case proto::FileFormat::BINARY:
+      return kuscia_proto::FileFormat::BINARY;
+    case proto::FileFormat::CSV:
+    case proto::FileFormat::ORC:
+      return kuscia_proto::FileFormat::CSV;
+    default:
+      DATAPROXY_THROW("unsupported file format:{}",
+                      proto::FileFormat_Name(format));
+  }
+}
+
+google::protobuf::Any BuildDownloadAny(const proto::DownloadInfo& info,
+                                       proto::FileFormat file_format) {
+  google::protobuf::Any any;
+  proto::CommandDomainDataQuery msg;
+  msg.set_domaindata_id(info.domaindata_id());
+  // Requires a newer kuscia version before this can be enabled.
+  // msg.set_partition_spec(info.partition_spec());
+  msg.set_content_type(FormatToContentType(file_format));
+
+  any.PackFrom(msg);
+  return any;
+}
+
+google::protobuf::Any BuildUploadAny(const proto::UploadInfo& info,
+                                     proto::FileFormat file_format) {
+  google::protobuf::Any any;
+  proto::CommandDomainDataUpdate msg;
+  msg.set_domaindata_id(info.domaindata_id());
+  msg.set_content_type(FormatToContentType(file_format));
+
+  any.PackFrom(msg);
+  return any;
+}
+
+proto::CreateDomainDataRequest BuildActionCreateDomainDataRequest(
+    const proto::UploadInfo& info, proto::FileFormat file_format) {
+  proto::CreateDomainDataRequest msg;
+  msg.set_domaindata_id(info.domaindata_id());
+  msg.set_name(info.name());
+  msg.set_type(info.type());
+  msg.set_datasource_id(info.datasource_id());
+  msg.set_relative_uri(info.relative_uri());
+  for (auto& attribute : info.attributes()) {
+    msg.mutable_attributes()->insert(attribute);
+  }
+
+  msg.mutable_columns()->CopyFrom(info.columns());
+  msg.set_vendor(info.vendor());
+  msg.set_file_format(ChangeToKusciaFileFormat(file_format));
+
+  return msg;
+}
+
+proto::DeleteDomainDataRequest BuildActionDeleteDomainDataRequest(
+    const proto::UploadInfo& info) {
+  proto::DeleteDomainDataRequest msg;
+  msg.set_domaindata_id(info.domaindata_id());
+  return msg;
+}
+
+proto::CreateDomainDataResponse GetActionCreateDomainDataResponse(
+    const std::string& msg) {
+  proto::CreateDomainDataResponse response;
+  response.ParseFromString(msg);
+  return response;
+}
+
+void CheckUploadInfo(const proto::UploadInfo& info) {
+  // Enum: table,model,rule,report,unknown
+  if (info.type() != "table" && info.type() != "model" &&
+      info.type() != "rule" && info.type() != "report") {
+    DATAPROXY_THROW("type[{}] not supported in UploadInfo!", info.type());
+  }
+
+  if (info.type() == "table" && info.columns().empty()) {
+    DATAPROXY_THROW(
+        "when type is table, columns cannot be empty in UploadInfo!");
+  }
+}
+
+}  // namespace dataproxy_sdk
\ No newline at end of file
diff --git a/dataproxy_sdk/cc/data_proxy_pb.h b/dataproxy_sdk/cc/data_proxy_pb.h
new file mode 100644
index 0000000..1d13f93
--- /dev/null
+++ b/dataproxy_sdk/cc/data_proxy_pb.h
@@ -0,0 +1,47 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "dataproxy_sdk/proto/data_proxy_pb.pb.h"
+#include "kuscia/proto/api/v1alpha1/datamesh/flightdm.pb.h"
+
+namespace dataproxy_sdk {
+
+namespace proto {
+// using namespace kuscia::proto::api::v1alpha1;
+using namespace kuscia::proto::api::v1alpha1::datamesh;
+}  // namespace proto
+
+namespace dm_proto = kuscia::proto::api::v1alpha1::datamesh;
+namespace kuscia_proto = kuscia::proto::api::v1alpha1;
+
+google::protobuf::Any BuildDownloadAny(const proto::DownloadInfo& info,
+                                       proto::FileFormat file_format);
+
+google::protobuf::Any BuildUploadAny(const proto::UploadInfo& info,
+                                     proto::FileFormat file_format);
+
+proto::CreateDomainDataRequest BuildActionCreateDomainDataRequest(
+    const proto::UploadInfo& info, proto::FileFormat file_format);
+
+proto::DeleteDomainDataRequest BuildActionDeleteDomainDataRequest(
+    const proto::UploadInfo& info);
+
+proto::CreateDomainDataResponse GetActionCreateDomainDataResponse(
+    const std::string& msg);
+
+void CheckUploadInfo(const proto::UploadInfo& info);
+
+}  // namespace dataproxy_sdk
\ No newline at end of file
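To make the request-building helpers concrete, a hedged sketch of preparing an UploadInfo that passes CheckUploadInfo; every field value here is a placeholder:

    #include "dataproxy_sdk/cc/data_proxy_pb.h"

    // Build a minimal, valid table upload request (values are illustrative).
    dataproxy_sdk::proto::UploadInfo MakeExampleUploadInfo() {
      dataproxy_sdk::proto::UploadInfo info;
      info.set_name("example_table");
      info.set_type("table");                    // table/model/rule/report only
      info.set_relative_uri("train/table.csv");  // joined onto datasource URI
      auto* column = info.add_columns();
      column->set_name("x");
      column->set_type("int64");                 // must be a GetDataType name
      dataproxy_sdk::CheckUploadInfo(info);      // throws on invalid input
      return info;
    }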
diff --git a/dataproxy_sdk/cc/exception.h b/dataproxy_sdk/cc/exception.h
new file mode 100644
index 0000000..fefa1de
--- /dev/null
+++ b/dataproxy_sdk/cc/exception.h
@@ -0,0 +1,53 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "yacl/base/exception.h"
+
+namespace dataproxy_sdk {
+
+#define DATAPROXY_THROW(...) YACL_THROW_WITH_STACK(__VA_ARGS__)
+
+#define DATAPROXY_ENFORCE(...) YACL_ENFORCE(__VA_ARGS__)
+
+#define DATAPROXY_ENFORCE_EQ(...) YACL_ENFORCE_EQ(__VA_ARGS__)
+
+#define CHECK_ARROW_OR_THROW(statement)  \
+  do {                                   \
+    auto __s__ = (statement);            \
+    if (!__s__.ok()) {                   \
+      DATAPROXY_THROW(__s__.ToString()); \
+    }                                    \
+  } while (false)
+
+#define CHECK_RESP_OR_THROW(resp)             \
+  do {                                        \
+    auto __s__ = (resp).status();             \
+    if (__s__.code()) {                       \
+      DATAPROXY_THROW("{}", __s__.message()); \
+    }                                         \
+  } while (false)
+
+// For arrow::Result (an adaptation of StatusOr from Asylo).
+#define ASSIGN_ARROW_OR_THROW(lhs, rexpr)         \
+  do {                                            \
+    auto __s__ = (rexpr);                         \
+    if (!__s__.ok()) {                            \
+      DATAPROXY_THROW(__s__.status().message()); \
+    }                                             \
+    lhs = std::move(__s__).ValueOrDie();          \
+  } while (false)
+
+}  // namespace dataproxy_sdk
\ No newline at end of file
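A short sketch of how these macros wrap Arrow's Status- and Result-returning APIs; the file path is a placeholder:

    #include <memory>

    #include "arrow/io/api.h"
    #include "dataproxy_sdk/cc/exception.h"

    void TouchFile() {
      // arrow::Result<T> unwraps into `out`, or throws a yacl exception.
      std::shared_ptr<arrow::io::FileOutputStream> out;
      ASSIGN_ARROW_OR_THROW(out,
                            arrow::io::FileOutputStream::Open("/tmp/x.bin"));
      // arrow::Status is checked and converted to an exception on failure.
      CHECK_ARROW_OR_THROW(out->Close());
    }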
diff --git a/dataproxy_sdk/cc/file_help.cc b/dataproxy_sdk/cc/file_help.cc
new file mode 100644
index 0000000..10d30a3
--- /dev/null
+++ b/dataproxy_sdk/cc/file_help.cc
@@ -0,0 +1,275 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dataproxy_sdk/cc/file_help.h"
+
+#include <memory>
+#include <vector>
+
+#include "arrow/adapters/orc/adapter.h"
+#include "arrow/builder.h"
+#include "arrow/csv/api.h"
+#include "arrow/io/api.h"
+#include "dataproxy_sdk/cc/exception.h"
+
+namespace dataproxy_sdk {
+
+class BinaryFileWrite : public FileHelpWrite {
+ public:
+  void DoWrite(std::shared_ptr<arrow::RecordBatch>& record_batch) {
+    DATAPROXY_ENFORCE_EQ(record_batch->num_columns(), 1);
+
+    auto binary_array =
+        std::dynamic_pointer_cast<arrow::BinaryArray>(record_batch->column(0));
+    CHECK_ARROW_OR_THROW(out_stream_->Write(
+        binary_array->raw_data(), binary_array->total_values_length()));
+  }
+  void DoClose() { CHECK_ARROW_OR_THROW(out_stream_->Close()); }
+
+ protected:
+  void DoOpen(const std::string& file_name) {
+    ASSIGN_ARROW_OR_THROW(out_stream_,
+                          arrow::io::FileOutputStream::Open(file_name));
+  }
+
+ private:
+  std::shared_ptr<arrow::io::FileOutputStream> out_stream_;
+};
+
+class CSVFileWrite : public FileHelpWrite {
+ public:
+  void DoWrite(std::shared_ptr<arrow::RecordBatch>& record_batch) {
+    CHECK_ARROW_OR_THROW(arrow::csv::WriteCSV(
+        *record_batch, arrow::csv::WriteOptions::Defaults(),
+        out_stream_.get()));
+  }
+  void DoClose() { CHECK_ARROW_OR_THROW(out_stream_->Close()); }
+
+ protected:
+  void DoOpen(const std::string& file_name) {
+    ASSIGN_ARROW_OR_THROW(out_stream_,
+                          arrow::io::FileOutputStream::Open(file_name));
+  }
+
+ private:
+  std::shared_ptr<arrow::io::FileOutputStream> out_stream_;
+};
+
+class ORCFileWrite : public FileHelpWrite {
+ public:
+  void DoWrite(std::shared_ptr<arrow::RecordBatch>& record_batch) {
+    CHECK_ARROW_OR_THROW(orc_writer_->Write(*record_batch));
+  }
+  void DoClose() {
+    CHECK_ARROW_OR_THROW(orc_writer_->Close());
+    CHECK_ARROW_OR_THROW(out_stream_->Close());
+  }
+
+ protected:
+  void DoOpen(const std::string& file_name) {
+    ASSIGN_ARROW_OR_THROW(out_stream_,
+                          arrow::io::FileOutputStream::Open(file_name));
+    ASSIGN_ARROW_OR_THROW(
+        orc_writer_,
+        arrow::adapters::orc::ORCFileWriter::Open(out_stream_.get()));
+  }
+
+ private:
+  std::unique_ptr<arrow::adapters::orc::ORCFileWriter> orc_writer_;
+  std::shared_ptr<arrow::io::FileOutputStream> out_stream_;
+};
+
+std::unique_ptr<FileHelpWrite> FileHelpWrite::Make(
+    proto::FileFormat file_format, const std::string& file_name) {
+  std::unique_ptr<FileHelpWrite> ret;
+  switch (file_format) {
+    case proto::FileFormat::CSV:
+      ret = std::make_unique<CSVFileWrite>();
+      break;
+    case proto::FileFormat::BINARY:
+      ret = std::make_unique<BinaryFileWrite>();
+      break;
+    case proto::FileFormat::ORC:
+      ret = std::make_unique<ORCFileWrite>();
+      break;
+    default:
+      DATAPROXY_THROW("format[{}] not supported.",
+                      proto::FileFormat_Name(file_format));
+      break;
+  }
+  ret->DoOpen(file_name);
+  return ret;
+}
+
+class BinaryFileRead : public FileHelpRead {
+ public:
+  BinaryFileRead(FileHelpRead::Options options) : FileHelpRead(options) {}
+  ~BinaryFileRead() = default;
+
+ private:
+  const int64_t kReadBytesLen = 128 * 1024;
+  const int64_t kChunksNum = 8;
+
+ public:
+  static std::shared_ptr<arrow::Schema> kBinaryFileSchema;
+
+ public:
+  void DoRead(std::shared_ptr<arrow::RecordBatch>* record_batch) {
+    arrow::BinaryBuilder binary_build;
+    for (int i = 0; i < kChunksNum; ++i) {
+      std::shared_ptr<arrow::Buffer> buffer;
+      ASSIGN_ARROW_OR_THROW(buffer, read_stream_->Read(kReadBytesLen));
+      CHECK_ARROW_OR_THROW(
+          binary_build.Append(buffer->data(), buffer->size()));
+      if (buffer->size() < kReadBytesLen) break;
+    }
+
+    if (binary_build.value_data_length() > 0) {
+      std::vector<std::shared_ptr<arrow::Array>> arrays(1);
+      CHECK_ARROW_OR_THROW(binary_build.Finish(&arrays[0]));
+      // Note: the row count must come from the array, not the column count.
+      *record_batch = arrow::RecordBatch::Make(this->Schema(),
+                                               arrays[0]->length(), arrays);
+    }
+  }
+  void DoClose() { CHECK_ARROW_OR_THROW(read_stream_->Close()); }
+  std::shared_ptr<arrow::Schema> Schema() { return kBinaryFileSchema; }
+
+ protected:
+  void DoOpen(const std::string& file_name) {
+    std::shared_ptr<arrow::io::ReadableFile> file_stream;
+    ASSIGN_ARROW_OR_THROW(file_stream,
+                          arrow::io::ReadableFile::Open(file_name));
+    int64_t file_total_size = 0;
+    ASSIGN_ARROW_OR_THROW(file_total_size, file_stream->GetSize());
+    ASSIGN_ARROW_OR_THROW(read_stream_, arrow::io::RandomAccessFile::GetStream(
+                                            file_stream, 0, file_total_size));
+  }
+
+ private:
+  std::shared_ptr<arrow::io::InputStream> read_stream_;
+};
+
+std::shared_ptr<arrow::Schema> BinaryFileRead::kBinaryFileSchema =
+    arrow::schema({arrow::field("binary_data", arrow::binary())});
+
+class CSVFileRead : public FileHelpRead {
+ public:
+  CSVFileRead(FileHelpRead::Options options)
+      : FileHelpRead(options),
+        convert_options_(arrow::csv::ConvertOptions::Defaults()) {
+    for (auto& pair : options.column_types) {
+      convert_options_.column_types.emplace(pair.first, pair.second);
+      convert_options_.include_columns.push_back(pair.first);
+    }
+  }
+  ~CSVFileRead() = default;
+
+ public:
+  void DoRead(std::shared_ptr<arrow::RecordBatch>* record_batch) {
+    CHECK_ARROW_OR_THROW(file_reader_->ReadNext(record_batch));
+  }
+  void DoClose() { CHECK_ARROW_OR_THROW(file_reader_->Close()); }
+  std::shared_ptr<arrow::Schema> Schema() { return file_reader_->schema(); }
+
+ protected:
+  void DoOpen(const std::string& file_name) {
+    std::shared_ptr<arrow::io::ReadableFile> file_stream;
+    ASSIGN_ARROW_OR_THROW(file_stream,
+                          arrow::io::ReadableFile::Open(file_name));
+    ASSIGN_ARROW_OR_THROW(
+        file_reader_,
+        arrow::csv::StreamingReader::Make(
+            arrow::io::default_io_context(), file_stream,
+            arrow::csv::ReadOptions::Defaults(),
+            arrow::csv::ParseOptions::Defaults(), convert_options_));
+  }
+
+ private:
+  std::shared_ptr<arrow::csv::StreamingReader> file_reader_;
+  arrow::csv::ConvertOptions convert_options_;
+};
+
+class ORCFileRead : public FileHelpRead {
+ public:
+  ORCFileRead(FileHelpRead::Options options)
+      : FileHelpRead(options), current_stripe_(0) {
+    for (auto& pair : options.column_types) {
+      include_names_.push_back(pair.first);
+    }
+  }
+  ~ORCFileRead() = default;
+
+ public:
+  void DoRead(std::shared_ptr<arrow::RecordBatch>* record_batch) {
+    if (current_stripe_ >= orc_reader_->NumberOfStripes()) return;
+    if (include_names_.empty()) {
+      ASSIGN_ARROW_OR_THROW(*record_batch,
+                            orc_reader_->ReadStripe(current_stripe_));
+    } else {
+      ASSIGN_ARROW_OR_THROW(
+          *record_batch,
+          orc_reader_->ReadStripe(current_stripe_, include_names_));
+    }
+    ++current_stripe_;
+  }
+  void DoClose() { CHECK_ARROW_OR_THROW(file_stream_->Close()); }
+  std::shared_ptr<arrow::Schema> Schema() {
+    std::shared_ptr<arrow::Schema> ret;
+    ASSIGN_ARROW_OR_THROW(ret, orc_reader_->ReadSchema());
+    return ret;
+  }
+
+ protected:
+  void DoOpen(const std::string& file_name) {
+    ASSIGN_ARROW_OR_THROW(file_stream_,
+                          arrow::io::ReadableFile::Open(file_name));
+    ASSIGN_ARROW_OR_THROW(orc_reader_,
+                          arrow::adapters::orc::ORCFileReader::Open(
+                              file_stream_, arrow::default_memory_pool()));
+  }
+
+ private:
+  int64_t current_stripe_;
+  std::unique_ptr<arrow::adapters::orc::ORCFileReader> orc_reader_;
+  std::shared_ptr<arrow::io::ReadableFile> file_stream_;
+  std::vector<std::string> include_names_;
+};
+
+std::unique_ptr<FileHelpRead> FileHelpRead::Make(
+    proto::FileFormat file_format, const std::string& file_name,
+    const FileHelpRead::Options& options) {
+  std::unique_ptr<FileHelpRead> ret;
+  switch (file_format) {
+    case proto::FileFormat::CSV:
+      ret = std::make_unique<CSVFileRead>(options);
+      break;
+    case proto::FileFormat::BINARY:
+      ret = std::make_unique<BinaryFileRead>(options);
+      break;
+    case proto::FileFormat::ORC:
+      ret = std::make_unique<ORCFileRead>(options);
+      break;
+    default:
+      DATAPROXY_THROW("format[{}] not supported.",
+                      proto::FileFormat_Name(file_format));
+      break;
+  }
+  ret->DoOpen(file_name);
+  return ret;
+}
+
+FileHelpRead::Options FileHelpRead::Options::Defaults() {
+  return FileHelpRead::Options();
+}
+
+}  // namespace dataproxy_sdk
diff --git a/dataproxy_sdk/cc/file_help.h b/dataproxy_sdk/cc/file_help.h
new file mode 100644
index 0000000..63e0d85
--- /dev/null
+++ b/dataproxy_sdk/cc/file_help.h
@@ -0,0 +1,77 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "arrow/type.h"
+#include "dataproxy_sdk/cc/data_proxy_pb.h"
+
+namespace dataproxy_sdk {
+
+class FileHelpBase {
+ public:
+  FileHelpBase() = default;
+  virtual ~FileHelpBase() = default;
+
+ public:
+  virtual void DoClose() = 0;
+
+ protected:
+  virtual void DoOpen(const std::string& file_name) = 0;
+};
+
+class FileHelpWrite : public FileHelpBase {
+ public:
+  static std::unique_ptr<FileHelpWrite> Make(proto::FileFormat file_format,
+                                             const std::string& file_name);
+
+ public:
+  FileHelpWrite() = default;
+  virtual ~FileHelpWrite() = default;
+
+ public:
+  virtual void DoWrite(std::shared_ptr<arrow::RecordBatch>& record_batch) = 0;
+};
+
+class FileHelpRead : public FileHelpBase {
+ public:
+  struct Options {
+    std::unordered_map<std::string, std::shared_ptr<arrow::DataType>>
+        column_types;
+
+    static Options Defaults();
+  };
+
+ public:
+  static std::unique_ptr<FileHelpRead> Make(proto::FileFormat file_format,
+                                            const std::string& file_name,
+                                            const Options& options);
+  static std::unique_ptr<FileHelpRead> Make(proto::FileFormat file_format,
+                                            const std::string& file_name) {
+    return Make(file_format, file_name, Options::Defaults());
+  }
+
+ public:
+  explicit FileHelpRead(const Options& options) {}
+  virtual ~FileHelpRead() = default;
+
+ public:
+  virtual std::shared_ptr<arrow::Schema> Schema() = 0;
+  virtual void DoRead(std::shared_ptr<arrow::RecordBatch>* record_batch) = 0;
+};
+
+}  // namespace dataproxy_sdk
\ No newline at end of file
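As a usage sketch of the read/write helpers declared above, converting a CSV file to ORC is a Make/DoRead/DoWrite loop. The paths are placeholders and failures surface through the exception macros:

    #include <memory>

    #include "dataproxy_sdk/cc/file_help.h"

    void CsvToOrc() {
      auto reader = dataproxy_sdk::FileHelpRead::Make(
          dataproxy_sdk::proto::FileFormat::CSV, "/tmp/in.csv");
      auto writer = dataproxy_sdk::FileHelpWrite::Make(
          dataproxy_sdk::proto::FileFormat::ORC, "/tmp/out.orc");
      while (true) {
        std::shared_ptr<arrow::RecordBatch> batch;
        reader->DoRead(&batch);
        if (!batch) break;  // stream exhausted
        writer->DoWrite(batch);
      }
      writer->DoClose();
      reader->DoClose();
    }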
diff --git a/dataproxy_sdk/cc/file_help_test.cc b/dataproxy_sdk/cc/file_help_test.cc
new file mode 100644
index 0000000..144014c
--- /dev/null
+++ b/dataproxy_sdk/cc/file_help_test.cc
@@ -0,0 +1,301 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dataproxy_sdk/cc/file_help.h"
+
+#include <iostream>
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+
+#include "arrow/adapters/orc/adapter.h"
+#include "arrow/builder.h"
+#include "arrow/csv/api.h"
+#include "arrow/io/api.h"
+#include "arrow/visit_type_inline.h"
+#include "dataproxy_sdk/cc/exception.h"
+#include "gtest/gtest.h"
+
+namespace dataproxy_sdk {
+
+class RandomBatchGenerator {
+ public:
+  std::shared_ptr<arrow::Schema> schema;
+  RandomBatchGenerator(std::shared_ptr<arrow::Schema> schema)
+      : schema(schema) {}
+
+  static std::shared_ptr<arrow::RecordBatch> Generate(
+      std::shared_ptr<arrow::Schema> schema, int32_t num_rows) {
+    RandomBatchGenerator generator(schema);
+
+    std::shared_ptr<arrow::RecordBatch> batch;
+    ASSIGN_ARROW_OR_THROW(batch, generator.Generate(num_rows));
+    return batch;
+  }
+
+  arrow::Result<std::shared_ptr<arrow::RecordBatch>> Generate(
+      int32_t num_rows) {
+    num_rows_ = num_rows;
+    for (std::shared_ptr<arrow::Field> field : schema->fields()) {
+      ARROW_RETURN_NOT_OK(arrow::VisitTypeInline(*field->type(), this));
+    }
+    return arrow::RecordBatch::Make(schema, num_rows, arrays_);
+  }
+
+  // Default implementation
+  arrow::Status Visit(const arrow::DataType &type) {
+    return arrow::Status::NotImplemented("Generating data for",
+                                         type.ToString());
+  }
+
+  arrow::Status Visit(const arrow::BinaryType &) {
+    auto builder = arrow::BinaryBuilder();
+    // std::normal_distribution<> d{
+    //     /*mean=*/0x05,
+    // };  // normal distribution (unused)
+    for (int32_t i = 0; i < num_rows_; ++i) {
+      ARROW_RETURN_NOT_OK(builder.Append("03", 2));
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+    arrays_.push_back(array);
+    return arrow::Status::OK();
+  }
+
+  arrow::Status Visit(const arrow::DoubleType &) {
+    auto builder = arrow::DoubleBuilder();
+    std::normal_distribution<> d{/*mean=*/5.0, /*stddev=*/2.0};
+    for (int32_t i = 0; i < num_rows_; ++i) {
+      ARROW_RETURN_NOT_OK(builder.Append(d(gen_)));
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+    arrays_.push_back(array);
+    return arrow::Status::OK();
+  }
+
+  arrow::Status Visit(const arrow::Int64Type &) {
+    // Draw random non-negative integers from a Poisson distribution.
+    std::poisson_distribution<> d{/*mean=*/4};
+    auto builder = arrow::Int64Builder();
+    for (int32_t i = 0; i < num_rows_; ++i) {
+      ARROW_RETURN_NOT_OK(builder.Append(d(gen_)));
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
+    arrays_.push_back(array);
+    return arrow::Status::OK();
+  }
+
+ protected:
+  std::random_device rd_{};
+  std::mt19937 gen_{rd_()};  // random seed for the distributions
+  std::vector<std::shared_ptr<arrow::Array>> arrays_;
+  int32_t num_rows_;
+};  // RandomBatchGenerator
+
+proto::FileFormat GetFormat(const std::string &file) {
+  if (file.find(".csv") != std::string::npos)
+    return proto::FileFormat::CSV;
+  else if (file.find(".orc") != std::string::npos)
+    return proto::FileFormat::ORC;
+
+  return proto::FileFormat::BINARY;
+}
+
+static std::shared_ptr<arrow::RecordBatch> GetRecordBatch(int data_num = 2) {
+  static std::shared_ptr<arrow::Schema> gSchema = arrow::schema(
+      {arrow::field("x", arrow::int64()), arrow::field("y", arrow::int64()),
+       arrow::field("z", arrow::int64())});
+
+  return RandomBatchGenerator::Generate(gSchema, data_num);
+}
+
+const std::string kCSVFilePath = "test.csv";
+const std::string kORCFilePath = "test.orc";
+const std::string kBinaryFilePath = "test.txt";
+
+TEST(FileHelpTest, Binary) {
+  std::shared_ptr<arrow::Schema> schema =
+      arrow::schema({arrow::field("binary_data", arrow::binary())});
+
+  std::shared_ptr<arrow::RecordBatch> batch =
+      RandomBatchGenerator::Generate(schema, 1);
+  auto writer =
+      FileHelpWrite::Make(GetFormat(kBinaryFilePath), kBinaryFilePath);
+  writer->DoWrite(batch);
+  writer->DoClose();
+
+  std::shared_ptr<arrow::RecordBatch> read_batch;
+  auto reader =
+      FileHelpRead::Make(GetFormat(kBinaryFilePath), kBinaryFilePath);
+  reader->DoRead(&read_batch);
+  reader->DoClose();
+
+  std::cout << batch->ToString() << std::endl;
+  std::cout << read_batch->ToString() << std::endl;
+
+  EXPECT_TRUE(batch->Equals(*read_batch));
+}
+
+TEST(FileHelpTest, ZeroBinary) {
+  std::shared_ptr<arrow::Schema> schema =
+      arrow::schema({arrow::field("binary_data", arrow::binary())});
+
+  auto binary_builder = arrow::BinaryBuilder();
+  CHECK_ARROW_OR_THROW(binary_builder.Append("3\000\00045\0006\000", 8));
+  std::shared_ptr<arrow::Array> array;
+  ASSIGN_ARROW_OR_THROW(array, binary_builder.Finish());
+  std::vector<std::shared_ptr<arrow::Array>> arrays;
+  arrays.push_back(array);
+
+  std::shared_ptr<arrow::RecordBatch> batch =
+      arrow::RecordBatch::Make(schema, arrays.size(), arrays);
+  auto writer =
+      FileHelpWrite::Make(GetFormat(kBinaryFilePath), kBinaryFilePath);
+  writer->DoWrite(batch);
+  writer->DoClose();
+
+  std::shared_ptr<arrow::RecordBatch> read_batch;
+  auto reader =
+      FileHelpRead::Make(GetFormat(kBinaryFilePath), kBinaryFilePath);
+  reader->DoRead(&read_batch);
+  reader->DoClose();
+
+  std::cout << batch->ToString() << std::endl;
+  std::cout << read_batch->ToString() << std::endl;
+
+  EXPECT_TRUE(batch->Equals(*read_batch));
+}
+
+TEST(FileHelpTest, CSV) {
+  std::shared_ptr<arrow::RecordBatch> batch = GetRecordBatch();
+
+  auto writer = FileHelpWrite::Make(GetFormat(kCSVFilePath), kCSVFilePath);
+  writer->DoWrite(batch);
+  writer->DoClose();
+
+  std::shared_ptr<arrow::RecordBatch> read_batch;
+  auto reader = FileHelpRead::Make(GetFormat(kCSVFilePath), kCSVFilePath);
+  reader->DoRead(&read_batch);
+  reader->DoClose();
+
+  std::cout << batch->ToString() << std::endl;
+  std::cout << read_batch->ToString() << std::endl;
+
+  EXPECT_TRUE(batch->Equals(*read_batch));
+}
+
+TEST(FileHelpTest, ORC) {
+  std::shared_ptr<arrow::RecordBatch> batch = GetRecordBatch();
+
+  auto writer = FileHelpWrite::Make(GetFormat(kORCFilePath), kORCFilePath);
+  writer->DoWrite(batch);
+  writer->DoClose();
+
+  std::shared_ptr<arrow::RecordBatch> read_batch;
+  auto reader = FileHelpRead::Make(GetFormat(kORCFilePath), kORCFilePath);
+  reader->DoRead(&read_batch);
+  reader->DoClose();
+
+  std::cout << batch->ToString() << std::endl;
+  std::cout << read_batch->ToString() << std::endl;
+
+  EXPECT_TRUE(batch->Equals(*read_batch));
+}
+
+FileHelpRead::Options GetReadOptions() {
+  FileHelpRead::Options read_options = FileHelpRead::Options::Defaults();
+  read_options.column_types.emplace("z", arrow::int64());
+  return read_options;
+}
+
+std::vector<int> GetSelectColumns() {
+  static std::vector<int> select_columns(1, 2);
+  return select_columns;
+}
+
+TEST(FileHelpTestWithOption, CSV) {
+  std::shared_ptr<arrow::RecordBatch> batch = GetRecordBatch();
+
+  auto writer = FileHelpWrite::Make(GetFormat(kCSVFilePath), kCSVFilePath);
+  writer->DoWrite(batch);
+  writer->DoClose();
+
+  std::shared_ptr<arrow::RecordBatch> read_batch;
+  auto reader = FileHelpRead::Make(GetFormat(kCSVFilePath), kCSVFilePath,
+                                   GetReadOptions());
+  reader->DoRead(&read_batch);
+  reader->DoClose();
+
+  auto target_batch = batch->SelectColumns(GetSelectColumns()).ValueOrDie();
+  std::cout << target_batch->ToString() << std::endl;
+  std::cout << read_batch->ToString() << std::endl;
+
+  EXPECT_TRUE(target_batch->Equals(*read_batch));
+}
+
+TEST(FileHelpTestWithOption, ORC) {
+  std::shared_ptr<arrow::RecordBatch> batch = GetRecordBatch();
+
+  auto writer = FileHelpWrite::Make(GetFormat(kORCFilePath), kORCFilePath);
+  writer->DoWrite(batch);
+  writer->DoClose();
+
+  std::shared_ptr<arrow::RecordBatch> read_batch;
+  auto reader = FileHelpRead::Make(GetFormat(kORCFilePath), kORCFilePath,
+                                   GetReadOptions());
+  reader->DoRead(&read_batch);
+  reader->DoClose();
+
+  auto target_batch = batch->SelectColumns(GetSelectColumns()).ValueOrDie();
+  std::cout << target_batch->ToString() << std::endl;
+  std::cout << read_batch->ToString() << std::endl;
+
+  EXPECT_TRUE(target_batch->Equals(*read_batch));
+}
+
+FileHelpRead::Options GetErrorOptions() {
+  FileHelpRead::Options read_options = FileHelpRead::Options::Defaults();
+  read_options.column_types.emplace("a", arrow::int64());
+  return read_options;
+}
+
+TEST(FileHelpTestWithOption, ErrorCSV) {
+  std::shared_ptr<arrow::RecordBatch> batch = GetRecordBatch();
+
+  auto writer = FileHelpWrite::Make(GetFormat(kCSVFilePath), kCSVFilePath);
+  writer->DoWrite(batch);
+  writer->DoClose();
+
+  std::shared_ptr<arrow::RecordBatch> read_batch;
+  EXPECT_THROW(FileHelpRead::Make(GetFormat(kCSVFilePath), kCSVFilePath,
+                                  GetErrorOptions()),
+               yacl::Exception);
+}
+
+TEST(FileHelpTestWithOption, ErrorORC) {
+  std::shared_ptr<arrow::RecordBatch> batch = GetRecordBatch();
+
+  auto writer = FileHelpWrite::Make(GetFormat(kORCFilePath), kORCFilePath);
+  writer->DoWrite(batch);
+  writer->DoClose();
+
+  std::shared_ptr<arrow::RecordBatch> read_batch;
+  auto reader = FileHelpRead::Make(GetFormat(kORCFilePath), kORCFilePath,
+                                   GetErrorOptions());
+  EXPECT_THROW(reader->DoRead(&read_batch), yacl::Exception);
+}
+
+}  // namespace dataproxy_sdk
\ No newline at end of file
diff --git a/dataproxy_sdk/cc/utils.cc b/dataproxy_sdk/cc/utils.cc
new file mode 100644
index 0000000..cba6ee1
--- /dev/null
+++ b/dataproxy_sdk/cc/utils.cc
@@ -0,0 +1,54 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dataproxy_sdk/cc/utils.h"
+
+#include <filesystem>
+#include <fstream>
+#include <unordered_map>
+
+#include "dataproxy_sdk/cc/exception.h"
+
+namespace dataproxy_sdk {
+
+std::string ReadFileContent(const std::string& file) {
+  if (!std::filesystem::exists(file)) {
+    DATAPROXY_THROW("cannot find file: {}", file);
+  }
+  std::ifstream file_is(file);
+  DATAPROXY_ENFORCE(file_is.good(), "open failed, file: {}", file);
+  return std::string((std::istreambuf_iterator<char>(file_is)),
+                     std::istreambuf_iterator<char>());
+}
+
+std::shared_ptr<arrow::DataType> GetDataType(const std::string& type) {
+  static std::unordered_map<std::string, std::shared_ptr<arrow::DataType>>
+      type_map = {{"int8", arrow::int8()},       {"int16", arrow::int16()},
+                  {"int32", arrow::int32()},     {"int64", arrow::int64()},
+                  {"uint8", arrow::uint8()},     {"uint16", arrow::uint16()},
+                  {"uint32", arrow::uint32()},   {"uint64", arrow::uint64()},
+                  {"float16", arrow::float16()}, {"float32", arrow::float32()},
+                  {"float64", arrow::float64()}, {"bool", arrow::boolean()},
+                  {"int", arrow::int64()},       {"float", arrow::float64()},
+                  {"str", arrow::utf8()},        {"string", arrow::utf8()}};
+
+  auto iter = type_map.find(type);
+  if (iter == type_map.end()) {
+    DATAPROXY_THROW("Unsupported type: {}", type);
+  }
+
+  return iter->second;
+}
+
+}  // namespace dataproxy_sdk
\ No newline at end of file
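A small sketch of these helpers in isolation. The column type strings mirror what UploadInfo carries, and the certificate path is a placeholder:

    #include <iostream>
    #include <string>

    #include "dataproxy_sdk/cc/utils.h"

    int main() {
      // "int" and "float" are aliases for the widest variants.
      auto type = dataproxy_sdk::GetDataType("int");
      std::cout << type->ToString() << std::endl;  // prints "int64"

      // Reads a whole file into a string; throws if the path does not exist.
      std::string pem = dataproxy_sdk::ReadFileContent("/tmp/ca.crt");
      return 0;
    }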
diff --git a/dataproxy_sdk/cc/utils.h b/dataproxy_sdk/cc/utils.h
new file mode 100644
index 0000000..cf5794a
--- /dev/null
+++ b/dataproxy_sdk/cc/utils.h
@@ -0,0 +1,27 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/type_fwd.h"
+
+namespace dataproxy_sdk {
+
+std::string ReadFileContent(const std::string& file);
+
+std::shared_ptr<arrow::DataType> GetDataType(const std::string& type);
+
+}  // namespace dataproxy_sdk
\ No newline at end of file
diff --git a/dataproxy_sdk/proto/BUILD.bazel b/dataproxy_sdk/proto/BUILD.bazel
new file mode 100644
index 0000000..a0ef99e
--- /dev/null
+++ b/dataproxy_sdk/proto/BUILD.bazel
@@ -0,0 +1,40 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+load("@rules_proto//proto:defs.bzl", "proto_library")
+load("@rules_proto_grpc//python:defs.bzl", "python_proto_compile")
+
+package(default_visibility = ["//visibility:public"])
+
+proto_library(
+    name = "data_proxy_proto",
+    srcs = ["data_proxy_pb.proto"],
+    deps = [
+        "@kuscia//proto/api/v1alpha1:common_proto",
+    ],
+)
+
+cc_proto_library(
+    name = "data_proxy_proto_cc",
+    deps = [":data_proxy_proto"],
+)
+
+python_proto_compile(
+    name = "data_proxy_proto_py",
+    output_mode = "NO_PREFIX",
+    prefix_path = "../..",
+    protos = [
+        ":data_proxy_proto",
+    ],
+)
diff --git a/dataproxy_sdk/proto/data_proxy_pb.proto b/dataproxy_sdk/proto/data_proxy_pb.proto
new file mode 100644
index 0000000..a43ad97
--- /dev/null
+++ b/dataproxy_sdk/proto/data_proxy_pb.proto
@@ -0,0 +1,83 @@
+// Copyright 2024 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+syntax = "proto3";
+
+package dataproxy_sdk.proto;
+
+import "kuscia/proto/api/v1alpha1/common.proto";
+
+message TlsConfig {
+  // Certificate file path
+  string certificate_path = 1;
+
+  // Private key file path
+  string private_key_path = 2;
+
+  // The trusted CA file to verify the peer's certificate
+  // If empty, use the system default CA files
+  string ca_file_path = 3;
+}
+
+enum FileFormat {
+  UNKNOWN = 0;
+  CSV = 1;
+  BINARY = 2;
+  ORC = 3;
+}
+
+message DataProxyConfig {
+  // Kuscia data proxy address, e.g. 127.0.0.1:8617
diff --git a/dataproxy_sdk/python/BUILD.bazel b/dataproxy_sdk/python/BUILD.bazel
new file mode 100644
index 0000000..0621a76
--- /dev/null
+++ b/dataproxy_sdk/python/BUILD.bazel
@@ -0,0 +1,71 @@
+# Copyright 2024 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") +load("@rules_python//python:defs.bzl", "py_library") + +package(default_visibility = ["//visibility:public"]) + +exports_files( + [ + "exported_symbols.lds", + "version_script.lds", + ], + visibility = ["//visibility:private"], +) + +pybind_extension( + name = "libdataproxy", + srcs = ["libdataproxy.cc"], + linkopts = select({ + "@bazel_tools//src/conditions:darwin": [ + "-Wl,-exported_symbols_list,$(location //dataproxy_sdk/python:exported_symbols.lds)", + ], + "//conditions:default": [ + "-Wl,--version-script,$(location //dataproxy_sdk/python:version_script.lds)", + ], + }), + deps = [ + ":exported_symbols.lds", + ":version_script.lds", + "//dataproxy_sdk/cc:dataproxy_sdk_cc", + ], +) + +py_library( + name = "data_proxy_file_py", + srcs = [ + "dp_file_adapter.py", + ], + data = [ + ":libdataproxy.so", + ], +) + +py_library( + name = "protos", + srcs = [ + "dp_pb2.py", + "//dataproxy_sdk/proto:data_proxy_proto_py", + ], +) + +py_library( + name = "init", + srcs = [ + "__init__.py", + ":data_proxy_file_py", + ":protos", + ], +) diff --git a/dataproxy_sdk/python/__init__.py b/dataproxy_sdk/python/__init__.py new file mode 100644 index 0000000..46f09c2 --- /dev/null +++ b/dataproxy_sdk/python/__init__.py @@ -0,0 +1,7 @@ +from . import dp_pb2 as proto +from .dp_file_adapter import * + +__all__ = [ + "DataProxyFileAdapter", + "proto", +] \ No newline at end of file diff --git a/dataproxy_sdk/python/dp_file_adapter.py b/dataproxy_sdk/python/dp_file_adapter.py new file mode 100644 index 0000000..38a79b5 --- /dev/null +++ b/dataproxy_sdk/python/dp_file_adapter.py @@ -0,0 +1,15 @@ +from . import libdataproxy +from . import dp_pb2 as proto + +class DataProxyFileAdapter: + def __init__(self, config: proto.DataProxyConfig): + self.data_proxy_file = libdataproxy.DataProxyFile(config.SerializeToString()) + + def close(self): + self.data_proxy_file.close() + + def download_file(self, info: proto.DownloadInfo, file_path: str, file_format: proto.FileFormat): + self.data_proxy_file.download_file(info.SerializeToString(), file_path, file_format) + + def upload_file(self, info: proto.UploadInfo, file_path: str, file_format: proto.FileFormat): + self.data_proxy_file.upload_file(info.SerializeToString(), file_path, file_format) \ No newline at end of file diff --git a/dataproxy_sdk/python/dp_pb2.py b/dataproxy_sdk/python/dp_pb2.py new file mode 100644 index 0000000..7b6470e --- /dev/null +++ b/dataproxy_sdk/python/dp_pb2.py @@ -0,0 +1 @@ +from dataproxy_sdk.proto.data_proxy_pb_pb2 import * \ No newline at end of file diff --git a/dataproxy_sdk/python/exported_symbols.lds b/dataproxy_sdk/python/exported_symbols.lds new file mode 100644 index 0000000..2637585 --- /dev/null +++ b/dataproxy_sdk/python/exported_symbols.lds @@ -0,0 +1 @@ +_PyInit_* \ No newline at end of file diff --git a/dataproxy_sdk/python/libdataproxy.cc b/dataproxy_sdk/python/libdataproxy.cc new file mode 100644 index 0000000..167a86d --- /dev/null +++ b/dataproxy_sdk/python/libdataproxy.cc @@ -0,0 +1,69 @@ +// Copyright 2024 Ant Group Co., Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dataproxy_sdk/cc/api.h"
+#include "dataproxy_sdk/cc/exception.h"
+#include "pybind11/pybind11.h"
+
+namespace py = pybind11;
+
+namespace dataproxy_sdk {
+
+PYBIND11_MODULE(libdataproxy, m) {
+  m.doc() = R"pbdoc(
+      Secretflow-DataProxy-SDK Python Library
+  )pbdoc";
+
+  py::register_exception_translator(
+      [](std::exception_ptr p) {  // NOLINT: pybind11
+        try {
+          if (p) {
+            std::rethrow_exception(p);
+          }
+        } catch (const yacl::Exception& e) {
+          // Translate this exception to a standard RuntimeError
+          PyErr_SetString(PyExc_RuntimeError,
+                          fmt::format("what: \n\t{}\n", e.what()).c_str());
+        }
+      });
+
+  py::class_<DataProxyFile, std::unique_ptr<DataProxyFile>>(m, "DataProxyFile")
+      .def(py::init(
+          [](const py::bytes& config_str) -> std::unique_ptr<DataProxyFile> {
+            proto::DataProxyConfig config;
+            config.ParseFromString(config_str);
+            return DataProxyFile::Make(config);
+          }))
+      .def("download_file",
+           [](DataProxyFile& self, const py::bytes& info_str,
+              const std::string& file_path, int file_format) {
+             proto::DownloadInfo info;
+             info.ParseFromString(info_str);
+
+             self.DownloadFile(info, file_path,
+                               static_cast<proto::FileFormat>(file_format));
+           })
+      .def("upload_file",
+           [](DataProxyFile& self, const py::bytes& info_str,
+              const std::string& file_path, int file_format) {
+             proto::UploadInfo info;
+             info.ParseFromString(info_str);
+
+             self.UploadFile(info, file_path,
+                             static_cast<proto::FileFormat>(file_format));
+           })
+      .def("close", &DataProxyFile::Close);
+}
+
+}  // namespace dataproxy_sdk
\ No newline at end of file
diff --git a/dataproxy_sdk/python/version_script.lds b/dataproxy_sdk/python/version_script.lds
new file mode 100644
index 0000000..a7e3bc0
--- /dev/null
+++ b/dataproxy_sdk/python/version_script.lds
@@ -0,0 +1,9 @@
+VERS_1.0 {
+  # Export symbols in pybind.
+  global:
+    PyInit_*;
+
+  # Hide everything else.
+  local:
+    *;
+};
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d4bb2cb
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..ea55e5d
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,147 @@
+# Copyright 2023 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + +import os.path + +# -- Project information ----------------------------------------------------- + +project = 'DataProxy' +copyright = '2023 Ant Group Co., Ltd.' +author = 'DataProxy authors' + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.napoleon', + 'sphinx.ext.autodoc', + 'sphinx.ext.graphviz', + 'sphinx.ext.todo', + 'sphinx.ext.viewcode', + 'sphinx.ext.extlinks', + 'sphinx.ext.autosectionlabel', + 'myst_parser', + "nbsphinx", + 'sphinxcontrib.actdiag', + 'sphinxcontrib.blockdiag', + 'sphinxcontrib.nwdiag', + 'sphinxcontrib.packetdiag', + 'sphinxcontrib.rackdiag', + 'sphinxcontrib.seqdiag', + 'sphinx_design', +] + +nbsphinx_requirejs_path = '' + +# Make sure the target is unique +autosectionlabel_prefix_document = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# Enable TODO +todo_include_todos = True + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'pydata_sphinx_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static']
+
+html_favicon = '_static/favicon.ico'
+
+html_css_files = [
+    'css/custom.css',
+]
+
+html_js_files = ['js/custom.js']
+
+html_theme_options = {
+    "icon_links": [
+        {
+            "name": "GitHub",
+            "url": "https://github.com/secretflow/dataproxy",
+            "icon": "fab fa-github-square",
+            "type": "fontawesome",
+        },
+    ],
+    "logo": {
+        "text": "DataProxy",
+    },
+}
+
+myst_enable_extensions = [
+    "amsmath",
+    "colon_fence",
+    "deflist",
+    "dollarmath",
+    "fieldlist",
+    "html_admonition",
+    "html_image",
+    "linkify",
+    "replacements",
+    "smartquotes",
+    "strikethrough",
+    "substitution",
+    "tasklist",
+    "attrs_inline",
+    "attrs_block",
+]
+
+suppress_warnings = ["myst.header"]
+
+myst_gfm_only = False
+myst_heading_anchors = 1
+myst_title_to_header = True
+
+
+# app setup hook
+def setup(app):
+    app.add_config_value(
+        'recommonmark_config',
+        {
+            'auto_toc_tree_section': 'Contents',
+        },
+        True,
+    )
diff --git a/docs/development/build_dataproxy_cn.md b/docs/development/build_dataproxy_cn.md
new file mode 100644
index 0000000..c41792d
--- /dev/null
+++ b/docs/development/build_dataproxy_cn.md
@@ -0,0 +1,53 @@
+# Build Commands
+
+## Development Environment Setup
+
+### Development Dependencies
+
+* JDK: 17
+* Maven: 3.5+
+* Docker
+
+## Building DataProxy
+
+DataProxy provides a Makefile for building the project. Run `make help` to see the available targets; the Development section provides the build commands:
+
+```shell
+Usage:
+  make <target>
+
+General
+  help             Display this help.
+
+Development
+  test             Run tests.
+  build            Build dataproxy binary.
+  image            Build docker image with the manager.
+  docs             Build docs.
+```
+
+### Tests
+
+From the DataProxy project root:
+
+Run `make test` to execute all of the project's tests.
+
+### Building the Executable JAR
+
+From the DataProxy project root:
+
+Run `make build` to build DataProxy's executable JAR; the build artifacts are placed in the ./target/ directory.
+
+### Building the DataProxy Image
+
+From the DataProxy project root:
+
+Run `make image` to build the DataProxy image with Docker. For now, DataProxy only supports building a linux/amd64 Anolis image.
+
+### Building the Documentation
+
+From the DataProxy project root:
+
+Run `make docs` to generate the DataProxy documentation under `docs/_build/html`; open `docs/_build/html/index.html` in a browser to read it.
+
+This command requires a Python environment with the pip tool installed; set them up before building the docs, otherwise the command will fail.
\ No newline at end of file
diff --git a/docs/development/index.rst b/docs/development/index.rst
new file mode 100644
index 0000000..1fcc0b3
--- /dev/null
+++ b/docs/development/index.rst
@@ -0,0 +1,8 @@
+Development
+===========
+
+
+.. toctree::
+   :maxdepth: 2
+
+   build_dataproxy_cn
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..889ee54
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,22 @@
+.. DataProxy documentation master file, created by
+   sphinx-quickstart on Fri Aug 10 10:10:10 2023.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+
+DataProxy
+==================
+
+DataProxy is a data service framework built on `Arrow Flight <https://arrow.apache.org/docs/format/Flight.html>`_ that connects to a rich set of data sources and provides unified, easy-to-use, efficient, and robust data read/write services. With DataProxy:
+
+* you can connect to a rich set of data sources, including MySQL, S3, Aliyun OSS, local disk, and more
+* you can read and write different data sources through one unified interface
+
+Contents
+--------
+
+..
toctree:: + :maxdepth: 2 + + development/index + deployment_experience/index diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..0d5df12 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,14 @@ +myst-parser==2.0.0 +rstcheck==6.1.2 +sphinx==6.2.1 +nbsphinx==0.9.2 +sphinx-autobuild==2021.3.14 +sphinx-markdown-parser==0.2.4 +sphinxcontrib-actdiag==3.0.0 +sphinxcontrib-blockdiag==3.0.0 +sphinxcontrib-nwdiag==2.0.0 +sphinxcontrib-seqdiag==3.0.0 +pytablewriter==0.64.2 +linkify-it-py==2.0.2 +sphinx_design==0.4.1 +pydata-sphinx-theme==0.13.3 \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..225327f --- /dev/null +++ b/pom.xml @@ -0,0 +1,493 @@ + + + + 4.0.0 + pom + + dataproxy-common + dataproxy-api + dataproxy-manager + dataproxy-service + dataproxy-server + dataproxy-integration-tests + + + org.springframework.boot + spring-boot-starter-parent + 3.1.12 + + + org.secretflow + dataproxy + 0.0.1-SNAPSHOT + + dataproxy + dataproxy parent + + + 17 + 17 + 17 + 1.18.24 + 3.22.5 + 1.62.2 + 32.1.1-jre + 2.17.2 + 1.3.2 + 1.7 + 1.4.14 + 3.12.0 + 4.4 + 1.26.2 + 4.0.3 + 1.3.1 + 3.4.0 + 14.0.0 + 2.0 + 1.9.6 + 2.0.1.Final + 6.2.3.Final + 4.1.101.Final + 4.1.101.Final + 2.0.61.Final + 5.4 + 2.5.5 + + 8.2.0 + 3.3.6 + 1.5.4 + + 0.48.6-public + + 1.1.5 + 1.0 + 1.6.4 + 3.3.13 + + 1.7.1 + 0.6.1 + 3.3.1 + UTF-8 + + + + + + org.secretflow + dataproxy-api + ${project.version} + + + org.secretflow + dataproxy-common + ${project.version} + + + org.secretflow + dataproxy-integration-tests + ${project.version} + + + org.secretflow + dataproxy-manager + ${project.version} + + + org.secretflow + dataproxy-server + ${project.version} + + + org.secretflow + dataproxy-service + ${project.version} + + + + org.projectlombok + lombok + ${lombok.version} + + + javax.annotation + javax.annotation-api + ${javax.version} + + + com.google.guava + guava + ${guava.version} + + + io.netty + netty-all + ${netty-all.version} + + + io.netty + netty-handler + ${netty-all.version} + + + io.netty + netty-tcnative-boringssl-static + ${netty-tcnative-boringssl-static.version} + + + + com.google.protobuf + protobuf-java + ${protobuf.version} + + + com.google.protobuf + protobuf-java-util + ${protobuf.version} + + + io.grpc + grpc-netty-shaded + ${grpc.version} + + + + + + + org.apache.commons + commons-compress + ${commons-compress.version} + + + + + + + + org.apache.arrow + arrow-vector + ${arrow.version} + + + org.apache.arrow + arrow-dataset + ${arrow.version} + + + org.apache.arrow + arrow-memory-netty + ${arrow.version} + + + org.apache.arrow + arrow-format + ${arrow.version} + + + org.apache.arrow + flight-core + ${arrow.version} + + + io.netty + netty-transport-native-unix-common + + + io.netty + netty-transport-native-kqueue + + + io.netty + netty-transport-native-epoll + + + io.netty + netty-handler + + + com.fasterxml.jackson.core + * + + + + + org.apache.arrow + flight-grpc + ${arrow.version} + + + + + com.fasterxml.jackson.datatype + jackson-datatype-jsr310 + ${jackson.version} + + + com.fasterxml.jackson.core + jackson-core + ${jackson.version} + + + com.fasterxml.jackson.core + jackson-databind + ${jackson.version} + + + com.fasterxml.jackson.core + jackson-annotations + ${jackson.version} + + + + + com.squareup.okio + okio + ${okio.version} + + + org.apache.commons + commons-lang3 + ${commons-lang3.version} + + + org.apache.commons + commons-collections4 + ${commons-collections4.version} + + + org.yaml + snakeyaml + ${snakeyaml.version} + + + 
commons-io + commons-io + 2.11.0 + + + com.opencsv + opencsv + ${opencsv.version} + + + com.github.ben-manes.caffeine + caffeine + ${caffeine.version} + + + + + javax.validation + validation-api + ${validation-api.version} + + + org.hibernate.validator + hibernate-validator + ${hibernate-validator.version} + + + + + + org.aspectj + aspectjweaver + ${aspectj.version} + + + org.aspectj + aspectjrt + ${aspectj.version} + + + + + + com.zaxxer + HikariCP + ${hikaricp.version} + + + com.mysql + mysql-connector-j + ${mysql-connector-j.version} + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + + + org.slf4j + slf4j-reload4j + + + jdk.tools + jdk.tools + + + org.apache.avro + avro + + + org.codehaus.jettison + jettison + + + org.apache.zookeeper + zookeeper + + + org.xerial.snappy + snappy-java + + + + + org.apache.hadoop + hadoop-aws + ${hadoop.version} + + + org.apache.hadoop + hadoop-aliyun + ${hadoop.version} + + + + + com.aliyun.odps + odps-sdk-core + ${odps-code.version} + + + org.codehaus.jackson + jackson-mapper-asl + + + + + + org.xerial.snappy + snappy-java + 1.1.10.5 + + + + com.aliyun.oss + aliyun-sdk-oss + 3.17.3 + + + org.apache.httpcomponents + httpclient + + + commons-beanutils + commons-beanutils + + + org.apache.commons + commons-lang3 + + + + javax.xml.bind + jaxb-api + + + org.slf4j + slf4j-api + + + com.google.code.gson + gson + + + org.apache.httpcomponents + httpcore + + + commons-logging + commons-logging + + + + org.codehaus.jettison + jettison + + + + + + + + + + logback-classic + ch.qos.logback + ${logback-classic.version} + + + logback-core + ch.qos.logback + ${logback-classic.version} + + + org.springframework.boot + spring-boot-starter-validation + + + logback-classic + ch.qos.logback + + + logback-core + ch.qos.logback + + + + + org.springframework.boot + spring-boot-starter-test + test + + + + + + + kr.motd.maven + os-maven-plugin + ${plugin.os.version} + + + + + + org.xolstice.maven.plugins + protobuf-maven-plugin + ${plugin.protobuf.version} + + false + src/main/java + com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier} + + grpc-java + io.grpc:protoc-gen-grpc-java:${grpc.version}:exe:${os.detected.classifier} + + + + + + compile + compile-custom + + + + + + + + \ No newline at end of file diff --git a/proto/kuscia/common.proto b/proto/kuscia/common.proto new file mode 100644 index 0000000..a8bdc72 --- /dev/null +++ b/proto/kuscia/common.proto @@ -0,0 +1,59 @@ +// Copyright 2023 Ant Group Co., Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package kuscia.proto.api.v1alpha1; + +import "google/protobuf/any.proto"; + +option go_package = "github.com/secretflow/kuscia/proto/api/v1alpha1"; +option java_package = "org.secretflow.v1alpha1.common"; + +// RequestHeader carries the user custom headers. +message RequestHeader { + // Custom headers used to record custom information. + map custom_headers = 1; +} + +// Status carries the response status information. 
+// Reference: https://github.com/grpc/grpc/blob/master/src/proto/grpc/status/status.proto
+message Status {
+  // The status code, which should be one of the google rpc codes or a custom code.
+  int32 code = 1;
+  // Message for recording the error information.
+  string message = 2;
+  // A list of messages that carry the additional supplementary error details.
+  repeated google.protobuf.Any details = 3;
+}
+
+message Partition {
+  // enum path, odps
+  string type = 1;
+  repeated DataColumn fields = 2;
+}
+
+// DataColumn defines the column of data.
+message DataColumn {
+  string name = 1;
+  // enum: string integer float datetime timestamp
+  string type = 2;
+  // The description of the column
+  string comment = 3;
+}
+
+enum FileFormat {
+  UNKNOWN = 0;
+  CSV = 1;
+}
\ No newline at end of file
diff --git a/proto/kuscia/domaindata.proto b/proto/kuscia/domaindata.proto
new file mode 100644
index 0000000..1d8379a
--- /dev/null
+++ b/proto/kuscia/domaindata.proto
@@ -0,0 +1,154 @@
+// Copyright 2023 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package kuscia.proto.api.v1alpha1.datamesh;
+
+import "kuscia/common.proto";
+
+option go_package = "github.com/secretflow/kuscia/proto/api/v1alpha1/datamesh";
+option java_package = "org.secretflow.v1alpha1.kusciaapi";
+
+service DomainDataService {
+  rpc CreateDomainData(CreateDomainDataRequest) returns (CreateDomainDataResponse);
+
+  rpc QueryDomainData(QueryDomainDataRequest) returns (QueryDomainDataResponse);
+
+  rpc UpdateDomainData(UpdateDomainDataRequest) returns (UpdateDomainDataResponse);
+
+  rpc DeleteDomainData(DeleteDomainDataRequest) returns (DeleteDomainDataResponse);
+}
+
+message CreateDomainDataRequest {
+  RequestHeader header = 1;
+  // Optional. The domaindata_id will be generated by the server if the domaindata_id is empty.
+  // It is the unique identity of the domaindata and must not duplicate within the same domain.
+  string domaindata_id = 2;
+  // The human-readable name; it may duplicate within the domain.
+  string name = 3;
+  // Enum: table,model,rule,report,unknown
+  string type = 4;
+  // The relative_uri is relative to the datasource URI; the datasource URI with relative_uri appended is the domaindata URI.
+  // e.g. the relative_uri is "train/table.csv"
+  // the URI of the datasource is "/home/data"
+  // the URI of the domaindata is "/home/data/train/table.csv"
+  string relative_uri = 5;
+  // Optional. The server uses the default datasource if datasource_id is empty.
+  // The datasource is where the domaindata is stored.
+  string datasource_id = 6;
+  // Optional. The attributes of the domaindata; this is an extra field that
+  // users can set to any data they need.
+  map<string, string> attributes = 7;
+  // Optional, Partition is not supported yet
+  Partition partition = 8;
+  // This field must be set if the type is 'table';
+  // the columns describe the table's schema information.
+  repeated DataColumn columns = 9;
+  // Optional. The vendor is the one who outputs the domaindata; it may be the SecretFlow engine,
+  // another vendor's engine, or manually registered. It can be manual, secretflow, or another vendor string.
+  string vendor = 10;
+}
+
+message CreateDomainDataResponse {
+  Status status = 1;
+  CreateDomainDataResponseData data = 2;
+}
+
+message CreateDomainDataResponseData {
+  // ID of the created domaindata
+  string domaindata_id = 1;
+}
+
+message UpdateDomainDataRequest {
+  RequestHeader header = 1;
+  // Mandatory. The domaindata_id indicates which domaindata will be updated.
+  string domaindata_id = 2;
+  // the human-readable name
+  string name = 3;
+  // Enum: table,model,rule,report,unknown
+  string type = 4;
+  // The relative_uri is relative to the datasource URI; the datasource URI with relative_uri appended is the domaindata URI.
+  // e.g. the relative_uri is "train/table.csv"
+  // the URI of the datasource is "/home/data"
+  // the URI of the domaindata is "/home/data/train/table.csv"
+  string relative_uri = 5;
+  // The datasource is where the domaindata is stored.
+  string datasource_id = 6;
+  // The attributes of the domaindata; this is an extra field that
+  // users can set to any data they need.
+  map<string, string> attributes = 7;
+  // Partition is not supported yet
+  Partition partition = 8;
+  // columns: the columns' information of the table data.
+  // this field must be set if the type is 'table'
+  repeated DataColumn columns = 9;
+  // The vendor is the one who outputs the domaindata; it may be the SecretFlow engine,
+  // another vendor's engine, or manually registered. e.g. manual, secretflow.
+  string vendor = 10;
+}
+
+message UpdateDomainDataResponse {
+  Status status = 1;
+}
+
+message DeleteDomainDataRequest {
+  RequestHeader header = 1;
+  // domaindata id
+  string domaindata_id = 2;
+}
+
+message DeleteDomainDataResponse {
+  Status status = 1;
+}
+
+message QueryDomainDataRequest {
+  RequestHeader header = 1;
+  string domaindata_id = 2;
+}
+
+message QueryDomainDataResponse {
+  Status status = 1;
+  DomainData data = 2;
+}
+
+message DomainData {
+  // domaindata_id is the identification of the domaindata; it must not duplicate within the domain.
+  string domaindata_id = 1;
+  // The human-readable name; it may duplicate within the domain.
+  string name = 2;
+  // domaindata type, Enum: table,model,rule,report,unknown
+  string type = 3;
+  // The relative_uri is relative to the datasource URI; the datasource URI with relative_uri appended is the domaindata URI.
+  // e.g. the relative_uri is "train/table.csv"
+  // the URI of the datasource is "/home/data"
+  // the URI of the domaindata is "/home/data/train/table.csv"
+  string relative_uri = 4;
+  // datasource_id is the identity of the domaindatasource where the domaindata file is stored.
+  string datasource_id = 5;
+  // attributes: the attributes of the domaindata; this is an extra field that
+  // users can set to any data they need.
+  map<string, string> attributes = 6;
+  // Partition is not supported yet
+  Partition partition = 7;
+  // This field must be set if the type is 'table';
+  // the columns describe the table's schema information.
+  repeated DataColumn columns = 8;
+  // The vendor is the one who outputs the domaindata; it may be the SecretFlow engine,
+  // another vendor's engine, or manually registered. It can be manual, secretflow, or another vendor string.
+  string vendor = 9;
+
+  // if the data is stored in a file format, file_format describes the format of the file
+  FileFormat file_format = 10;
+}
\ No newline at end of file
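For a 'table' domaindata, the request fields above fit together as in this illustrative sketch. Python bindings for the kuscia protos are not generated in this diff, so the module name `domaindata_pb2` is an assumption:

```python
import domaindata_pb2  # assumed module name for bindings of this file

req = domaindata_pb2.CreateDomainDataRequest(
    name="train_table",
    type="table",                    # table, model, rule, report, or unknown
    relative_uri="train/table.csv",  # datasource "/home/data" -> "/home/data/train/table.csv"
    datasource_id="",                # empty: the server picks the default datasource
)
# Column types follow the comment in common.proto: string/integer/float/datetime/timestamp.
req.columns.add(name="id", type="integer")
req.columns.add(name="score", type="float")
req.attributes["description"] = "demo table"  # free-form extra metadata
```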
diff --git a/proto/kuscia/domaindatasource.proto b/proto/kuscia/domaindatasource.proto
new file mode 100644
index 0000000..ac8380b
--- /dev/null
+++ b/proto/kuscia/domaindatasource.proto
@@ -0,0 +1,156 @@
+// Copyright 2023 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package kuscia.proto.api.v1alpha1.datamesh;
+
+import "kuscia/common.proto";
+
+option go_package = "github.com/secretflow/kuscia/proto/api/v1alpha1/datamesh";
+option java_package = "org.secretflow.v1alpha1.kusciaapi";
+
+service DomainDataSourceService {
+  rpc CreateDomainDataSource(CreateDomainDataSourceRequest) returns (CreateDomainDataSourceResponse);
+
+  rpc QueryDomainDataSource(QueryDomainDataSourceRequest) returns (QueryDomainDataSourceResponse);
+
+  rpc UpdateDomainDataSource(UpdateDomainDataSourceRequest) returns (UpdateDomainDataSourceResponse);
+
+  rpc DeleteDomainDataSource(DeleteDomainDataSourceRequest) returns (DeleteDomainDataSourceResponse);
+}
+
+message CreateDomainDataSourceRequest {
+  RequestHeader header = 1;
+  string datasource_id = 2;
+  string name = 3;
+  string type = 4;
+  DataSourceInfo info = 5;
+}
+
+message CreateDomainDataSourceResponse {
+  Status status = 1;
+  CreateDomainDataSourceResponseData data = 2;
+}
+
+message CreateDomainDataSourceResponseData {
+  // id of the created datasource
+  string datasource_id = 1;
+}
+
+message UpdateDomainDataSourceRequest {
+  RequestHeader header = 1;
+  string datasource_id = 2;
+  string name = 3;
+  string type = 4;
+  DataSourceInfo info = 5;
+}
+
+message UpdateDomainDataSourceResponse {
+  Status status = 1;
+}
+
+message DeleteDomainDataSourceRequest {
+  RequestHeader header = 1;
+  // datasource id
+  string datasource_id = 2;
+}
+
+message DeleteDomainDataSourceResponse {
+  Status status = 1;
+}
+
+message QueryDomainDataSourceRequest {
+  RequestHeader header = 1;
+  string datasource_id = 2;
+}
+
+message QueryDomainDataSourceResponse {
+  Status status = 1;
+  DomainDataSource data = 2;
+}
+
+message DomainDataSource {
+  // datasource id
+  string datasource_id = 1;
+  // datasource name
+  string name = 2;
+  // datasource type, enum [localfs, oss]; oss is not supported yet
+  string type = 3;
+  // datasource status, enum [Available, Unavailable]
+  string status = 4;
+  // datasource info
+  DataSourceInfo info = 5;
+}
+
+message DataSourceInfo {
+  // only one of the members (localfs, oss, database, odps) is set in a DataSourceInfo:
+  // LocalDataSourceInfo exists only if the type of the datasource is localfs
+  // OssDataSourceInfo exists only if the type of the datasource is oss
+  LocalDataSourceInfo localfs = 1;
+  // oss is not supported yet
+  OssDataSourceInfo oss = 2;
+
+  // Relational database info, such as MySQL/Oracle/PostgreSQL
+  DatabaseDataSourceInfo database = 3;
+  // Aliyun ODPS (MaxCompute)
+  OdpsDataSourceInfo odps = 4;
+}
+
+// datasource info for local path
+message LocalDataSourceInfo {
+  // the local path of the datasource
+  string path = 1;
+}
+
+// datasource info for oss
+// oss is not supported yet
+message OssDataSourceInfo {
+  // endpoint oss.xxx.cn-xxx.com or 127.0.0.1:9000
+  string endpoint = 1;
+  // the bucket name of the oss datasource
+  string bucket = 2;
+  // the prefix of the oss datasource. e.g. data/traindata/
+  string prefix = 3;
+  // access key
+  string access_key_id = 4;
+  // access secret
+  string access_key_secret = 5;
+  // virtualhost has the same meaning as Aliyun OSS's virtualhost; default true
+  bool virtualhost = 6;
+  // optional enum [s3v2, s3v4]
+  string version = 7;
+  // optional enum [oss, minio], default oss
+  string storage_type = 8;
+}
+
+// datasource info for MySQL/Oracle/PostgreSQL
+message DatabaseDataSourceInfo {
+  // endpoint for the database, such as localhost:3306
+  string endpoint = 1;
+  // username of the account used to access the database
+  string user = 2;
+  // password of the account used to access the database
+  string password = 3;
+  // database name
+  string database = 4;
+}
+
+message OdpsDataSourceInfo {
+  // ODPS address such as "https://odps.xxx.cn-xxx.com"
+  string endpoint = 1;
+  string project = 2;
+  string access_key_id = 3;
+  string access_key_secret = 4;
+}
\ No newline at end of file
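Registering a localfs datasource with these messages looks roughly like the following; as before, `domaindatasource_pb2` is an assumed binding module and the id is hypothetical:

```python
import domaindatasource_pb2  # assumed module name for bindings of this file

req = domaindatasource_pb2.CreateDomainDataSourceRequest(
    datasource_id="default-data-source",  # hypothetical id
    name="default-data-source",
    type="localfs",
)
# Exactly one member of DataSourceInfo is set, matching `type`.
req.info.localfs.path = "/home/data"
```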
diff --git a/proto/kuscia/flightdm.proto b/proto/kuscia/flightdm.proto
new file mode 100644
index 0000000..30d30fc
--- /dev/null
+++ b/proto/kuscia/flightdm.proto
@@ -0,0 +1,132 @@
+// Copyright 2023 Ant Group Co., Ltd.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto3";
+
+package kuscia.proto.api.v1alpha1.datamesh;
+
+import "kuscia/common.proto";
+import "kuscia/domaindata.proto";
+import "kuscia/domaindatasource.proto";
+
+option go_package = "github.com/secretflow/kuscia/proto/api/v1alpha1/datamesh";
+option java_package = "org.secretflow.v1alpha1.kusciaapi";
+
+// write options of csv file
+message CSVWriteOptions {
+  // field delimiter of csv file, default is comma
+  string field_delimiter = 1;
+}
+
+// write options vary by file format
+message FileWriteOptions {
+  oneof Options {
+    CSVWriteOptions csv_options = 1;
+  }
+}
+
+// ContentType describes the schema of the ArrowRecordBatch response from DataProxy.
+// For the Table type only, the schema is defined by the DomainData's columns;
+// for the rest of the types, the schema is a dummy schema which only contains one column of arrow::Binary type
+enum ContentType {
+  Table = 0;
+  // read raw data from the datasource
+  RAW = 1;
+  // convert data to csv file format, and return the file content as the dummy schema
+  CSV = 2;
+}
+
+// call with GetSchema, returns SchemaResult; extension attributes are set in the metadata
+message CommandGetDomainDataSchema {
+  string domaindata_id = 1;
+}
+
+// call GetFlightInfo with CommandDomainDataQuery, returns TicketDomainDataQuery,
+// and then call DoGet with TicketDomainDataQuery
+message CommandDomainDataQuery {
+  string domaindata_id = 1;
+  // column names of the DomainData's columns
+  repeated string columns = 2;
+  // expected result format
+  ContentType content_type = 3;
+  FileWriteOptions file_write_options = 4;
+  string partition_spec = 5;
+}
+
+// call GetFlightInfo with CommandDomainDataUpdate, returns TicketDomainDataQuery which is an update query,
+// and then call DoPut with TicketDomainDataQuery
+message CommandDomainDataUpdate {
+  string domaindata_id = 1;
+  // create a nonexistent domaindata and get an update ticket
+  CreateDomainDataRequest domaindata_request = 2;
+  ContentType content_type = 3;
+  // for domaindata stored in a file format, you can specify file_write_options
+  FileWriteOptions file_write_options = 4;
+  // extra options vary by datasource type
+  map<string, string> extra_options = 5;
+  // specifies the partition column and value, such as "dmdt=20240520"
+  string partition_spec = 6;
+}
+
+message TicketDomainDataQuery {
+  // a unique identifier related to a DomainData query
+  string domaindata_handle = 1;
+}
+
+// call DoAction with ActionCreateDomainDataRequest, return ActionCreateDomainDataResponse
+message ActionCreateDomainDataRequest {
+  CreateDomainDataRequest request = 1;
+}
+
+message ActionCreateDomainDataResponse {
+  CreateDomainDataResponse response = 1;
+}
+
+// call DoAction with ActionQueryDomainDataRequest, return ActionQueryDomainDataResponse
+message ActionQueryDomainDataRequest {
+  QueryDomainDataRequest request = 1;
+}
+
+message ActionQueryDomainDataResponse {
+  QueryDomainDataResponse response = 1;
+}
+
+// call DoAction with ActionUpdateDomainDataRequest, return ActionUpdateDomainDataResponse
+message ActionUpdateDomainDataRequest {
+  UpdateDomainDataRequest request = 1;
+}
+
+message ActionUpdateDomainDataResponse {
+  UpdateDomainDataResponse response = 1;
+}
+
+// call DoAction with ActionDeleteDomainDataRequest, return ActionDeleteDomainDataResponse
+message ActionDeleteDomainDataRequest {
+  DeleteDomainDataRequest request = 1;
+  // not supported now
+  bool physical_deletion = 2;
+}
+
+message ActionDeleteDomainDataResponse {
+  DeleteDomainDataResponse response = 1;
+}
+
+// call DoAction with ActionCreateDomainDataSourceRequest, return ActionCreateDomainDataSourceResponse
+message ActionCreateDomainDataSourceRequest {
+  CreateDomainDataSourceRequest request = 1;
+}
+
+message ActionCreateDomainDataSourceResponse {
+  CreateDomainDataSourceResponse response = 1;
+}
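The read flow described by the comments above (GetFlightInfo with a CommandDomainDataQuery, then DoGet with the returned ticket) can be sketched with pyarrow.flight. How DataMesh wraps the command bytes inside the FlightDescriptor (raw command vs. google.protobuf.Any) is server-defined; raw command bytes are assumed here, as are the module name `flightdm_pb2` and the address:

```python
import pyarrow.flight as flight

import flightdm_pb2  # assumed module name for bindings of this file

client = flight.FlightClient("grpc://127.0.0.1:8071")  # hypothetical DataMesh address

query = flightdm_pb2.CommandDomainDataQuery(
    domaindata_id="alice-table",
    content_type=flightdm_pb2.CSV,
)
descriptor = flight.FlightDescriptor.for_command(query.SerializeToString())

# GetFlightInfo resolves the query to endpoints carrying a TicketDomainDataQuery...
info = client.get_flight_info(descriptor)
# ...and DoGet exchanges the ticket for the actual record batches.
reader = client.do_get(info.endpoints[0].ticket)
table = reader.read_all()
```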
diff --git a/proto/kuscia/flightinner.proto b/proto/kuscia/flightinner.proto
new file mode 100644
index 0000000..d280c82
--- /dev/null
+++ b/proto/kuscia/flightinner.proto
@@ -0,0 +1,21 @@
+syntax = "proto3";
+
+package kuscia.proto.api.v1alpha1.datamesh;
+
+import "kuscia/domaindata.proto";
+import "kuscia/flightdm.proto";
+import "kuscia/domaindatasource.proto";
+
+option java_package = "org.secretflow.v1alpha1.kusciaapi";
+
+message CommandDataMeshQuery {
+  CommandDomainDataQuery query = 1;
+  DomainData domaindata = 2;
+  DomainDataSource datasource = 3;
+}
+
+message CommandDataMeshUpdate {
+  CommandDomainDataUpdate update = 1;
+  DomainData domaindata = 2;
+  DomainDataSource datasource = 3;
+}
\ No newline at end of file
diff --git a/scripts/build.sh b/scripts/build.sh
new file mode 100755
index 0000000..6a8b6ea
--- /dev/null
+++ b/scripts/build.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Copyright 2023 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+set -e
+
+WORK_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd -P)
+
+if [[ $BUILD_IN_IMAGE == true ]]; then
+  docker run --rm -v ${WORK_DIR}:/home/admin/dev secretflow-registry.cn-hangzhou.cr.aliyuncs.com/secretflow/java-dev:0.2 mvn clean package -Dmaven.test.skip
+else
+  mvn clean package -Dmaven.test.skip
+fi
diff --git a/scripts/build_base.sh b/scripts/build_base.sh
new file mode 100644
index 0000000..1bbd30d
--- /dev/null
+++ b/scripts/build_base.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Copyright 2023 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e
+
+VERSION_TAG=0.1
+tag=${VERSION_TAG}
+local_image=dataproxy-base-lite:$tag
+docker build -f ./build/Dockerfiles/base.Dockerfile --platform linux/amd64 -t "$local_image" .
+echo "local image: $local_image"
\ No newline at end of file
diff --git a/scripts/build_image.sh b/scripts/build_image.sh
new file mode 100755
index 0000000..030cadf
--- /dev/null
+++ b/scripts/build_image.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+#
+# Copyright 2023 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -e + +github_flag=true +./scripts/build.sh + +DATETIME=$(date +"%Y%m%d%H%M%S") +git fetch --tags + +# shellcheck disable=SC2046 +VERSION_TAG="$(git describe --tags $(git rev-list --tags --max-count=1))" +commit_id=$(git log -n 1 --pretty=oneline | awk '{print $1}' | cut -b 1-6) +tag=${VERSION_TAG}-${DATETIME}-"${commit_id}" +local_image=dataproxy:${tag} +echo "$commit_id" + +BUILDER_EXISTS=$( + docker buildx inspect dataproxy_image_buildx >/dev/null 2>&1 + echo $? +) + +if [ "$BUILDER_EXISTS" -eq 0 ]; then + echo "existing buildx builder: dataproxy_image_buildx" + docker buildx use dataproxy_image_buildx +else + echo "creating new buildx builder: dataproxy_image_buildx" + docker buildx create --name dataproxy_image_buildx --use +fi + +if [[ "$github_flag" == "true" ]]; then + echo "github_flag is true" + docker buildx build \ + --platform linux/arm64,linux/amd64 \ + --tag "${local_image}" \ + -f ./build/Dockerfiles/dataproxy.Dockerfile . \ + --load +fi diff --git a/scripts/start_dp.sh b/scripts/start_dp.sh new file mode 100644 index 0000000..8700fa1 --- /dev/null +++ b/scripts/start_dp.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# +# Copyright 2023 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -ex + +for i in "$@" +do +case $i in + --serving_config_file=*) + CONFIG_FILE="${i#*=}" +esac +done + +export DP_CONFIG_FILE=$CONFIG_FILE +java -Dsun.net.http.allowRestrictedHeaders=true --add-opens=java.base/java.nio=ALL-UNNAMED -jar ./dataproxy.jar \ No newline at end of file