diff --git a/be/src/olap/push_handler.cpp b/be/src/olap/push_handler.cpp index 575b002b2f6086..99568d47298aaa 100644 --- a/be/src/olap/push_handler.cpp +++ b/be/src/olap/push_handler.cpp @@ -34,15 +34,16 @@ #include #include #include +#include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/config.h" #include "common/logging.h" #include "common/status.h" +#include "io/hdfs_builder.h" #include "olap/delete_handler.h" #include "olap/olap_define.h" #include "olap/rowset/pending_rowset_helper.h" -#include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_writer.h" #include "olap/rowset/rowset_writer_context.h" #include "olap/schema.h" @@ -53,10 +54,11 @@ #include "olap/txn_manager.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" -#include "util/runtime_profile.h" #include "util/time.h" #include "vec/core/block.h" +#include "vec/core/column_with_type_and_name.h" #include "vec/data_types/data_type_factory.hpp" +#include "vec/data_types/data_type_nullable.h" #include "vec/exec/format/parquet/vparquet_reader.h" #include "vec/exprs/vexpr_context.h" #include "vec/functions/simple_function_factory.h" @@ -352,8 +354,12 @@ PushBrokerReader::PushBrokerReader(const Schema* schema, const TBrokerScanRange& _file_params.expr_of_dest_slot = _params.expr_of_dest_slot; _file_params.dest_sid_to_src_sid_without_trans = _params.dest_sid_to_src_sid_without_trans; _file_params.strict_mode = _params.strict_mode; - _file_params.__isset.broker_addresses = true; - _file_params.broker_addresses = t_scan_range.broker_addresses; + if (_ranges[0].file_type == TFileType::FILE_HDFS) { + _file_params.hdfs_params = parse_properties(_params.properties); + } else { + _file_params.__isset.broker_addresses = true; + _file_params.broker_addresses = t_scan_range.broker_addresses; + } for (const auto& range : _ranges) { TFileRangeDesc file_range; @@ -482,17 +488,36 @@ Status PushBrokerReader::_cast_to_input_block() { auto& arg = _src_block_ptr->get_by_name(slot_desc->col_name()); // remove nullable here, let the get_function decide whether nullable auto return_type = slot_desc->get_data_type_ptr(); - vectorized::ColumnsWithTypeAndName arguments { - arg, - {vectorized::DataTypeString().create_column_const( - arg.column->size(), remove_nullable(return_type)->get_family_name()), - std::make_shared(), ""}}; - auto func_cast = vectorized::SimpleFunctionFactory::instance().get_function( - "CAST", arguments, return_type); idx = _src_block_name_to_idx[slot_desc->col_name()]; - RETURN_IF_ERROR( - func_cast->execute(nullptr, *_src_block_ptr, {idx}, idx, arg.column->size())); - _src_block_ptr->get_by_position(idx).type = std::move(return_type); + // bitmap convert:src -> to_base64 -> bitmap_from_base64 + if (slot_desc->type().is_bitmap_type()) { + auto base64_return_type = vectorized::DataTypeFactory::instance().create_data_type( + vectorized::DataTypeString().get_type_as_type_descriptor(), + slot_desc->is_nullable()); + auto func_to_base64 = vectorized::SimpleFunctionFactory::instance().get_function( + "to_base64", {arg}, base64_return_type); + RETURN_IF_ERROR(func_to_base64->execute(nullptr, *_src_block_ptr, {idx}, idx, + arg.column->size())); + _src_block_ptr->get_by_position(idx).type = std::move(base64_return_type); + auto& arg_base64 = _src_block_ptr->get_by_name(slot_desc->col_name()); + auto func_bitmap_from_base64 = + vectorized::SimpleFunctionFactory::instance().get_function( + "bitmap_from_base64", {arg_base64}, return_type); + RETURN_IF_ERROR(func_bitmap_from_base64->execute(nullptr, 
*_src_block_ptr, {idx}, idx, + arg_base64.column->size())); + _src_block_ptr->get_by_position(idx).type = std::move(return_type); + } else { + vectorized::ColumnsWithTypeAndName arguments { + arg, + {vectorized::DataTypeString().create_column_const( + arg.column->size(), remove_nullable(return_type)->get_family_name()), + std::make_shared(), ""}}; + auto func_cast = vectorized::SimpleFunctionFactory::instance().get_function( + "CAST", arguments, return_type); + RETURN_IF_ERROR( + func_cast->execute(nullptr, *_src_block_ptr, {idx}, idx, arg.column->size())); + _src_block_ptr->get_by_position(idx).type = std::move(return_type); + } } return Status::OK(); } diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 01d981efdd97b4..a270501560d743 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -633,6 +633,10 @@ public class Config extends ConfigBase { @ConfField(description = {"Yarn 配置文件的路径", "Yarn config path"}) public static String yarn_config_dir = System.getenv("DORIS_HOME") + "/lib/yarn-config"; + @ConfField(mutable = true, masterOnly = true, description = {"Ingestion load 的默认超时时间,单位是秒。", + "Default timeout for ingestion load job, in seconds."}) + public static int ingestion_load_default_timeout_second = 86400; // 1 day + @ConfField(mutable = true, masterOnly = true, description = {"Sync job 的最大提交间隔,单位是秒。", "Maximal intervals between two sync job's commits."}) public static long sync_commit_interval_second = 10; diff --git a/fe/fe-common/src/main/java/org/apache/doris/sparkdpp/EtlJobConfig.java b/fe/fe-common/src/main/java/org/apache/doris/sparkdpp/EtlJobConfig.java index c59901d383b648..8d9d5de54b59f1 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/sparkdpp/EtlJobConfig.java +++ b/fe/fe-common/src/main/java/org/apache/doris/sparkdpp/EtlJobConfig.java @@ -371,14 +371,17 @@ public static class EtlIndex implements Serializable { public String indexType; @SerializedName(value = "isBaseIndex") public boolean isBaseIndex; + @SerializedName(value = "schemaVersion") + public int schemaVersion; public EtlIndex(long indexId, List etlColumns, int schemaHash, - String indexType, boolean isBaseIndex) { + String indexType, boolean isBaseIndex, int schemaVersion) { this.indexId = indexId; this.columns = etlColumns; this.schemaHash = schemaHash; this.indexType = indexType; this.isBaseIndex = isBaseIndex; + this.schemaVersion = schemaVersion; } public EtlColumn getColumn(String name) { @@ -398,6 +401,7 @@ public String toString() { + ", schemaHash=" + schemaHash + ", indexType='" + indexType + '\'' + ", isBaseIndex=" + isBaseIndex + + ", schemaVersion=" + schemaVersion + '}'; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/SparkResource.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/SparkResource.java index 704d8e512d7f3c..2af2a9b4a90df2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/SparkResource.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/SparkResource.java @@ -70,6 +70,7 @@ * * DROP RESOURCE "spark0"; */ +@Deprecated public class SparkResource extends Resource { private static final Logger LOG = LogManager.getLogger(SparkResource.class); diff --git a/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/LoadAction.java b/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/LoadAction.java index 2f9efc1ed1b1bf..b62ac7832e7a88 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/LoadAction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/httpv2/rest/LoadAction.java @@ -27,13 +27,21 @@ import org.apache.doris.common.Config; import org.apache.doris.common.DdlException; import org.apache.doris.common.LoadException; +import org.apache.doris.common.MetaNotFoundException; import org.apache.doris.common.Pair; +import org.apache.doris.common.QuotaExceedException; import org.apache.doris.common.UserException; import org.apache.doris.common.util.DebugPointUtil; +import org.apache.doris.datasource.InternalCatalog; import org.apache.doris.httpv2.entity.ResponseEntityBuilder; import org.apache.doris.httpv2.entity.RestBaseResult; import org.apache.doris.httpv2.exception.UnauthorizedException; +import org.apache.doris.httpv2.rest.manager.HttpUtils; +import org.apache.doris.load.FailMsg; import org.apache.doris.load.StreamLoadHandler; +import org.apache.doris.load.loadv2.IngestionLoadJob; +import org.apache.doris.load.loadv2.LoadJob; +import org.apache.doris.load.loadv2.LoadManager; import org.apache.doris.mysql.privilege.Auth; import org.apache.doris.mysql.privilege.PrivPredicate; import org.apache.doris.planner.GroupCommitPlanner; @@ -45,9 +53,14 @@ import org.apache.doris.system.BeSelectionPolicy; import org.apache.doris.system.SystemInfoService; import org.apache.doris.thrift.TNetworkAddress; +import org.apache.doris.transaction.BeginTransactionException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.json.JsonMapper; import com.google.common.base.Strings; import io.netty.handler.codec.http.HttpHeaderNames; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.validator.routines.InetAddressValidator; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -59,10 +72,14 @@ import org.springframework.web.bind.annotation.RestController; import org.springframework.web.servlet.view.RedirectView; +import java.io.IOException; import java.net.InetAddress; import java.net.URI; import java.util.Enumeration; +import java.util.HashMap; +import java.util.LinkedList; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.Set; import javax.servlet.http.HttpServletRequest; @@ -694,4 +711,198 @@ private Backend selectBackendForGroupCommit(String clusterName, HttpServletReque } return backend; } + + /** + * Request body example: + * { + * "label": "test", + * "tableToPartition": { + * "tbl_test_spark_load": ["p1","p2"] + * }, + * "properties": { + * "strict_mode": "true", + * "timeout": 3600000 + * } + * } + * + */ + @RequestMapping(path = "/api/ingestion_load/{" + CATALOG_KEY + "}/{" + DB_KEY + + "}/_create", method = RequestMethod.POST) + public Object createIngestionLoad(HttpServletRequest request, HttpServletResponse response, + @PathVariable(value = CATALOG_KEY) String catalog, + @PathVariable(value = DB_KEY) String db) { + if (needRedirect(request.getScheme())) { + return redirectToHttps(request); + } + + executeCheckPassword(request, response); + + if (!InternalCatalog.INTERNAL_CATALOG_NAME.equals(catalog)) { + return ResponseEntityBuilder.okWithCommonError("Only support internal catalog. 
" + + "Current catalog is " + catalog); + } + + Object redirectView = redirectToMaster(request, response); + if (redirectView != null) { + return redirectView; + } + + String fullDbName = getFullDbName(db); + + Map resultMap = new HashMap<>(); + + try { + + String body = HttpUtils.getBody(request); + JsonMapper mapper = JsonMapper.builder().build(); + JsonNode jsonNode = mapper.reader().readTree(body); + + String label = jsonNode.get("label").asText(); + Map> tableToPartition = mapper.reader() + .readValue(jsonNode.get("tableToPartition").traverse(), + new TypeReference>>() { + }); + List tableNames = new LinkedList<>(tableToPartition.keySet()); + for (String tableName : tableNames) { + checkTblAuth(ConnectContext.get().getCurrentUserIdentity(), fullDbName, tableName, PrivPredicate.LOAD); + } + + Map properties = new HashMap<>(); + if (jsonNode.hasNonNull("properties")) { + properties = mapper.readValue(jsonNode.get("properties").traverse(), + new TypeReference>() { + }); + } + + executeCreateAndStartIngestionLoad(fullDbName, label, tableNames, properties, tableToPartition, resultMap, + ConnectContext.get().getCurrentUserIdentity()); + + } catch (Exception e) { + LOG.warn("create ingestion load job failed, db: {}, err: {}", db, e.getMessage()); + return ResponseEntityBuilder.okWithCommonError(e.getMessage()); + } + + return ResponseEntityBuilder.ok(resultMap); + + } + + private void executeCreateAndStartIngestionLoad(String dbName, String label, List tableNames, + Map properties, + Map> tableToPartition, + Map resultMap, UserIdentity userInfo) + throws DdlException, BeginTransactionException, MetaNotFoundException, AnalysisException, + QuotaExceedException, LoadException { + + long loadId = -1; + try { + + LoadManager loadManager = Env.getCurrentEnv().getLoadManager(); + loadId = loadManager.createIngestionLoadJob(dbName, label, tableNames, properties, userInfo); + IngestionLoadJob loadJob = (IngestionLoadJob) loadManager.getLoadJob(loadId); + resultMap.put("loadId", loadId); + + long txnId = loadJob.beginTransaction(); + resultMap.put("txnId", txnId); + + Map loadMeta = loadJob.getLoadMeta(tableToPartition); + resultMap.put("dbId", loadMeta.get("dbId")); + resultMap.put("signature", loadMeta.get("signature")); + resultMap.put("tableMeta", loadMeta.get("tableMeta")); + + loadJob.startEtlJob(); + + } catch (DdlException | BeginTransactionException | MetaNotFoundException | AnalysisException + | QuotaExceedException | LoadException e) { + LOG.warn("create ingestion load job failed, db: {}, load id: {}, err: {}", dbName, loadId, e.getMessage()); + if (loadId != -1L) { + try { + Env.getCurrentEnv().getLoadManager().getLoadJob(loadId).cancelJob( + new FailMsg(FailMsg.CancelType.UNKNOWN, StringUtils.defaultIfBlank(e.getMessage(), ""))); + } catch (DdlException ex) { + LOG.warn("cancel ingestion load failed, db: {}, load id: {}, err: {}", dbName, loadId, + e.getMessage()); + } + } + throw e; + } + + } + + /** + * Request body example: + * { + * "statusInfo": { + * "msg": "", + * "hadoopProperties": "{\"fs.defaultFS\":\"hdfs://hadoop01:8020\",\"hadoop.username\":\"hadoop\"}", + * "appId": "local-1723088141438", + * "filePathToSize": "{\"hdfs://hadoop01:8020/spark-load/jobs/25054/test/36019/dpp_result.json\":179, + * \"hdfs://hadoop01:8020/spark-load/jobs/25054/test/36019/load_meta.json\":3441,\"hdfs://hadoop01:8020 + * /spark-load/jobs/25054/test/36019/V1.test.25056.29373.25057.0.366242211.parquet\":5745}", + * "dppResult": 
"{\"isSuccess\":true,\"failedReason\":\"\",\"scannedRows\":10,\"fileNumber\":1, + * \"fileSize\":2441,\"normalRows\":10,\"abnormalRows\":0,\"unselectRows\":0,\"partialAbnormalRows\":\"[]\", + * \"scannedBytes\":0}", + * "status": "SUCCESS" + * }, + * "loadId": 36018 + * } + * + */ + @RequestMapping(path = "/api/ingestion_load/{" + CATALOG_KEY + "}/{" + DB_KEY + + "}/_update", method = RequestMethod.POST) + public Object updateIngestionLoad(HttpServletRequest request, HttpServletResponse response, + @PathVariable(value = CATALOG_KEY) String catalog, + @PathVariable(value = DB_KEY) String db) { + if (needRedirect(request.getScheme())) { + return redirectToHttps(request); + } + + executeCheckPassword(request, response); + + if (!InternalCatalog.INTERNAL_CATALOG_NAME.equals(catalog)) { + return ResponseEntityBuilder.okWithCommonError("Only support internal catalog. " + + "Current catalog is " + catalog); + } + + Object redirectView = redirectToMaster(request, response); + if (redirectView != null) { + return redirectView; + } + + String fullDbName = getFullDbName(db); + + long loadId = -1; + try { + + String body = HttpUtils.getBody(request); + JsonMapper mapper = JsonMapper.builder().build(); + JsonNode jsonNode = mapper.readTree(body); + LoadJob loadJob = null; + + if (jsonNode.hasNonNull("loadId")) { + loadId = jsonNode.get("loadId").asLong(); + loadJob = Env.getCurrentEnv().getLoadManager().getLoadJob(loadId); + } + + if (loadJob == null) { + return ResponseEntityBuilder.okWithCommonError("load job not exists, load id: " + loadId); + } + + IngestionLoadJob ingestionLoadJob = (IngestionLoadJob) loadJob; + Set tableNames = ingestionLoadJob.getTableNames(); + for (String tableName : tableNames) { + checkTblAuth(ConnectContext.get().getCurrentUserIdentity(), fullDbName, tableName, PrivPredicate.LOAD); + } + Map statusInfo = mapper.readValue(jsonNode.get("statusInfo").traverse(), + new TypeReference>() { + }); + ingestionLoadJob.updateJobStatus(statusInfo); + } catch (IOException | MetaNotFoundException | UnauthorizedException e) { + LOG.warn("cancel ingestion load job failed, db: {}, load id: {}, err: {}", db, loadId, e.getMessage()); + return ResponseEntityBuilder.okWithCommonError(e.getMessage()); + } + + return ResponseEntityBuilder.ok(); + + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/EtlJobType.java b/fe/fe-core/src/main/java/org/apache/doris/load/EtlJobType.java index 95333d0f0250b9..7eaa89c97d0850 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/EtlJobType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/EtlJobType.java @@ -28,5 +28,6 @@ public enum EtlJobType { LOCAL_FILE, // create by job scheduler,inner use INSERT_JOB, + INGESTION, UNKNOWN } diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/IngestionLoadJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/IngestionLoadJob.java new file mode 100644 index 00000000000000..5af05d6b8231e9 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/IngestionLoadJob.java @@ -0,0 +1,1139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.load.loadv2; + +import org.apache.doris.analysis.CastExpr; +import org.apache.doris.analysis.DescriptorTable; +import org.apache.doris.analysis.Expr; +import org.apache.doris.analysis.LiteralExpr; +import org.apache.doris.analysis.SlotDescriptor; +import org.apache.doris.analysis.SlotRef; +import org.apache.doris.analysis.TupleDescriptor; +import org.apache.doris.analysis.UserIdentity; +import org.apache.doris.catalog.AggregateType; +import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.Database; +import org.apache.doris.catalog.DistributionInfo; +import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.HashDistributionInfo; +import org.apache.doris.catalog.KeysType; +import org.apache.doris.catalog.MaterializedIndex; +import org.apache.doris.catalog.MaterializedIndexMeta; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.Partition; +import org.apache.doris.catalog.PartitionItem; +import org.apache.doris.catalog.PartitionKey; +import org.apache.doris.catalog.PartitionType; +import org.apache.doris.catalog.PrimitiveType; +import org.apache.doris.catalog.RangePartitionInfo; +import org.apache.doris.catalog.Replica; +import org.apache.doris.catalog.ScalarType; +import org.apache.doris.catalog.Table; +import org.apache.doris.catalog.TableIf; +import org.apache.doris.catalog.Tablet; +import org.apache.doris.catalog.Type; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.DataQualityException; +import org.apache.doris.common.DdlException; +import org.apache.doris.common.DuplicatedRequestException; +import org.apache.doris.common.LabelAlreadyUsedException; +import org.apache.doris.common.LoadException; +import org.apache.doris.common.MetaNotFoundException; +import org.apache.doris.common.Pair; +import org.apache.doris.common.QuotaExceedException; +import org.apache.doris.common.UserException; +import org.apache.doris.common.io.Text; +import org.apache.doris.common.util.LogBuilder; +import org.apache.doris.common.util.LogKey; +import org.apache.doris.common.util.MetaLockUtils; +import org.apache.doris.load.EtlJobType; +import org.apache.doris.load.EtlStatus; +import org.apache.doris.load.FailMsg; +import org.apache.doris.service.ExecuteEnv; +import org.apache.doris.service.FrontendOptions; +import org.apache.doris.sparkdpp.DppResult; +import org.apache.doris.sparkdpp.EtlJobConfig; +import org.apache.doris.task.AgentBatchTask; +import org.apache.doris.task.AgentTaskExecutor; +import org.apache.doris.task.AgentTaskQueue; +import org.apache.doris.task.PushTask; +import org.apache.doris.thrift.TBrokerRangeDesc; +import org.apache.doris.thrift.TBrokerScanRange; +import org.apache.doris.thrift.TBrokerScanRangeParams; +import org.apache.doris.thrift.TColumn; +import org.apache.doris.thrift.TDescriptorTable; +import org.apache.doris.thrift.TEtlState; +import org.apache.doris.thrift.TFileFormatType; +import org.apache.doris.thrift.TFileType; +import org.apache.doris.thrift.TPriority; +import org.apache.doris.thrift.TPushType; +import org.apache.doris.thrift.TUniqueId; 
+import org.apache.doris.transaction.BeginTransactionException; +import org.apache.doris.transaction.TabletCommitInfo; +import org.apache.doris.transaction.TabletQuorumFailedException; +import org.apache.doris.transaction.TransactionState; + +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Range; +import com.google.common.collect.Sets; +import com.google.gson.Gson; +import com.google.gson.annotations.SerializedName; +import com.google.gson.reflect.TypeToken; +import lombok.Setter; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.DataInput; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Ingestion Load + *

+ * Loads data files that have been pre-processed by an external system + *
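+ * The job is created and driven externally through the HTTP endpoints added in LoadAction:
+ * /api/ingestion_load/{catalog}/{db}/_create builds the job, begins a transaction and returns
+ * the load meta, while /api/ingestion_load/{catalog}/{db}/_update reports the status of the
+ * external ETL job back to the FE.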

+ * There are 4 steps in IngestionLoadJob: + * Step1: Outside system execute ingestion etl job. + * Step2: LoadEtlChecker will check ingestion etl job status periodically + * and send push tasks to be when ingestion etl job is finished. + * Step3: LoadLoadingChecker will check loading status periodically and commit transaction when push tasks are finished. + * Step4: PublishVersionDaemon will send publish version tasks to be and finish transaction. + */ +public class IngestionLoadJob extends LoadJob { + + public static final Logger LOG = LogManager.getLogger(IngestionLoadJob.class); + + @Setter + @SerializedName("ests") + private EtlStatus etlStatus; + + // members below updated when job state changed to loading + // { tableId.partitionId.indexId.bucket.schemaHash -> (etlFilePath, etlFileSize) } + @SerializedName(value = "tm2fi") + private final Map> tabletMetaToFileInfo = Maps.newHashMap(); + + @SerializedName(value = "hp") + private final Map hadoopProperties = new HashMap<>(); + + @SerializedName(value = "i2sv") + private final Map indexToSchemaVersion = new HashMap<>(); + + private final Map indexToSchemaHash = Maps.newHashMap(); + + private final Map filePathToSize = new HashMap<>(); + + private final Set finishedReplicas = Sets.newHashSet(); + private final Set quorumTablets = Sets.newHashSet(); + private final Set fullTablets = Sets.newHashSet(); + + private final List commitInfos = Lists.newArrayList(); + + private final Map> tableToLoadPartitions = Maps.newHashMap(); + + private final Map> tabletToSentReplicaPushTask = Maps.newHashMap(); + + private long etlStartTimestamp = -1; + + private long quorumFinishTimestamp = -1; + + private List loadTableIds = new ArrayList<>(); + + public IngestionLoadJob() { + super(EtlJobType.INGESTION); + } + + public IngestionLoadJob(long dbId, String label, List tableNames, UserIdentity userInfo) + throws LoadException { + super(EtlJobType.INGESTION, dbId, label); + this.loadTableIds = getLoadTableIds(tableNames); + this.userInfo = userInfo; + } + + @Override + public Set getTableNamesForShow() { + return Collections.emptySet(); + } + + @Override + public Set getTableNames() throws MetaNotFoundException { + Set result = Sets.newHashSet(); + Database database = Env.getCurrentInternalCatalog().getDbOrMetaException(dbId); + for (long tableId : loadTableIds) { + Table table = database.getTableOrMetaException(tableId); + result.add(table.getName()); + } + return result; + } + + @Override + public void afterVisible(TransactionState txnState, boolean txnOperated) { + super.afterVisible(txnState, txnOperated); + clearJob(); + } + + @Override + public void afterAborted(TransactionState txnState, boolean txnOperated, String txnStatusChangeReason) + throws UserException { + super.afterAborted(txnState, txnOperated, txnStatusChangeReason); + clearJob(); + } + + @Override + public void cancelJobWithoutCheck(FailMsg failMsg, boolean abortTxn, boolean needLog) { + super.cancelJobWithoutCheck(failMsg, abortTxn, needLog); + clearJob(); + } + + @Override + public void cancelJob(FailMsg failMsg) throws DdlException { + super.cancelJob(failMsg); + clearJob(); + } + + private List getLoadTableIds(List tableNames) throws LoadException { + Database db = Env.getCurrentInternalCatalog() + .getDbOrException(dbId, s -> new LoadException("db does not exist. id: " + s)); + List list = new ArrayList<>(tableNames.size()); + for (String tableName : tableNames) { + OlapTable olapTable = (OlapTable) db.getTableOrException(tableName, + s -> new LoadException("table does not exist. 
id: " + s)); + list.add(olapTable.getId()); + } + return list; + } + + @Override + protected long getEtlStartTimestamp() { + return etlStartTimestamp; + } + + public long beginTransaction() + throws BeginTransactionException, MetaNotFoundException, AnalysisException, QuotaExceedException, + LabelAlreadyUsedException, DuplicatedRequestException { + this.transactionId = Env.getCurrentGlobalTransactionMgr() + .beginTransaction(dbId, loadTableIds, label, null, + new TransactionState.TxnCoordinator(TransactionState.TxnSourceType.FE, 0, + FrontendOptions.getLocalHostAddress(), ExecuteEnv.getInstance().getStartupTime()), + TransactionState.LoadJobSourceType.FRONTEND, id, getTimeout()); + return transactionId; + } + + public Map getLoadMeta(Map> tableToPartitionMap) + throws LoadException { + + if (tableToPartitionMap == null || tableToPartitionMap.isEmpty()) { + throw new IllegalArgumentException("tableToPartitionMap is empty"); + } + + Database db = Env.getCurrentInternalCatalog() + .getDbOrException(dbId, s -> new LoadException("db does not exist. id: " + s)); + Map loadMeta = new HashMap<>(); + loadMeta.put("dbId", db.getId()); + Long signature = Env.getCurrentEnv().getNextId(); + loadMeta.put("signature", signature); + + List tables; + try { + tables = db.getTablesOnIdOrderOrThrowException(loadTableIds); + } catch (MetaNotFoundException e) { + throw new LoadException(e.getMessage()); + } + + MetaLockUtils.readLockTables(tables); + try { + Map> tableMeta = new HashMap<>(tableToPartitionMap.size()); + for (Map.Entry> entry : tableToPartitionMap.entrySet()) { + String tableName = entry.getKey(); + Map meta = tableMeta.getOrDefault(tableName, new HashMap<>()); + OlapTable olapTable = (OlapTable) db.getTableOrException(tableName, + s -> new LoadException("table does not exist. 
id: " + s)); + meta.put("id", olapTable.getId()); + List indices = createEtlIndexes(olapTable); + meta.put("indexes", indices); + List partitionNames = entry.getValue(); + Set partitionIds; + if (partitionNames != null && !partitionNames.isEmpty()) { + partitionIds = new HashSet<>(partitionNames.size()); + for (String partitionName : partitionNames) { + Partition partition = olapTable.getPartition(partitionName); + if (partition == null) { + throw new LoadException(String.format("partition %s is not exists", partitionName)); + } + partitionIds.add(partition.getId()); + } + } else { + partitionIds = + olapTable.getAllPartitions().stream().map(Partition::getId).collect(Collectors.toSet()); + } + EtlJobConfig.EtlPartitionInfo etlPartitionInfo = createEtlPartitionInfo(olapTable, partitionIds); + meta.put("partitionInfo", etlPartitionInfo); + tableMeta.put(tableName, meta); + + if (tableToLoadPartitions.containsKey(olapTable.getId())) { + tableToLoadPartitions.get(olapTable.getId()).addAll(partitionIds); + } else { + tableToLoadPartitions.put(olapTable.getId(), partitionIds); + } + + } + loadMeta.put("tableMeta", tableMeta); + } finally { + MetaLockUtils.readUnlockTables(tables); + } + return loadMeta; + + } + + private List createEtlIndexes(OlapTable table) throws LoadException { + List etlIndexes = Lists.newArrayList(); + + for (Map.Entry> entry : table.getIndexIdToSchema().entrySet()) { + long indexId = entry.getKey(); + // todo(liheng): get schema hash and version from materialized index meta directly + MaterializedIndexMeta indexMeta = table.getIndexMetaByIndexId(indexId); + int schemaHash = indexMeta.getSchemaHash(); + int schemaVersion = indexMeta.getSchemaVersion(); + + boolean changeAggType = table.getKeysTypeByIndexId(indexId).equals(KeysType.UNIQUE_KEYS) + && table.getTableProperty().getEnableUniqueKeyMergeOnWrite(); + + // columns + List etlColumns = Lists.newArrayList(); + for (Column column : entry.getValue()) { + etlColumns.add(createEtlColumn(column, changeAggType)); + } + + // check distribution type + DistributionInfo distributionInfo = table.getDefaultDistributionInfo(); + if (distributionInfo.getType() != DistributionInfo.DistributionInfoType.HASH) { + // RANDOM not supported + String errMsg = "Unsupported distribution type. type: " + distributionInfo.getType().name(); + LOG.warn(errMsg); + throw new LoadException(errMsg); + } + + // index type + String indexType; + KeysType keysType = table.getKeysTypeByIndexId(indexId); + switch (keysType) { + case DUP_KEYS: + indexType = "DUPLICATE"; + break; + case AGG_KEYS: + indexType = "AGGREGATE"; + break; + case UNIQUE_KEYS: + indexType = "UNIQUE"; + break; + default: + String errMsg = "unknown keys type. 
type: " + keysType.name(); + LOG.warn(errMsg); + throw new LoadException(errMsg); + } + + indexToSchemaVersion.put(indexId, schemaVersion); + + etlIndexes.add(new EtlJobConfig.EtlIndex(indexId, etlColumns, schemaHash, indexType, + indexId == table.getBaseIndexId(), schemaVersion)); + } + + return etlIndexes; + } + + private EtlJobConfig.EtlColumn createEtlColumn(Column column, boolean changeAggType) { + // column name + String name = column.getName().toLowerCase(Locale.ROOT); + // column type + PrimitiveType type = column.getDataType(); + String columnType = column.getDataType().toString(); + // is allow null + boolean isAllowNull = column.isAllowNull(); + // is key + boolean isKey = column.isKey(); + + // aggregation type + String aggregationType = null; + if (column.getAggregationType() != null) { + if (changeAggType && !column.isKey()) { + aggregationType = AggregateType.REPLACE.toSql(); + } else { + aggregationType = column.getAggregationType().toString(); + } + } + + // default value + String defaultValue = null; + if (column.getDefaultValue() != null) { + defaultValue = column.getDefaultValue(); + } + if (column.isAllowNull() && column.getDefaultValue() == null) { + defaultValue = "\\N"; + } + + // string length + int stringLength = 0; + if (type.isStringType()) { + stringLength = column.getStrLen(); + } + + // decimal precision scale + int precision = 0; + int scale = 0; + if (type.isDecimalV2Type() || type.isDecimalV3Type()) { + precision = column.getPrecision(); + scale = column.getScale(); + } + + return new EtlJobConfig.EtlColumn(name, columnType, isAllowNull, isKey, aggregationType, defaultValue, + stringLength, precision, scale); + } + + private EtlJobConfig.EtlPartitionInfo createEtlPartitionInfo(OlapTable table, Set partitionIds) + throws LoadException { + PartitionType type = table.getPartitionInfo().getType(); + + List partitionColumnRefs = Lists.newArrayList(); + List etlPartitions = Lists.newArrayList(); + if (type == PartitionType.RANGE) { + RangePartitionInfo rangePartitionInfo = (RangePartitionInfo) table.getPartitionInfo(); + for (Column column : rangePartitionInfo.getPartitionColumns()) { + partitionColumnRefs.add(column.getName()); + } + + for (Map.Entry entry : rangePartitionInfo.getAllPartitionItemEntryList(true)) { + long partitionId = entry.getKey(); + if (!partitionIds.contains(partitionId)) { + continue; + } + + Partition partition = table.getPartition(partitionId); + if (partition == null) { + throw new LoadException("partition does not exist. 
id: " + partitionId); + } + + // bucket num + int bucketNum = partition.getDistributionInfo().getBucketNum(); + + // is max partition + Range range = entry.getValue().getItems(); + boolean isMaxPartition = range.upperEndpoint().isMaxValue(); + + // start keys + List rangeKeyExprs = range.lowerEndpoint().getKeys(); + List startKeys = Lists.newArrayList(); + for (LiteralExpr literalExpr : rangeKeyExprs) { + Object keyValue = literalExpr.getRealValue(); + startKeys.add(keyValue); + } + + // end keys + // is empty list when max partition + List endKeys = Lists.newArrayList(); + if (!isMaxPartition) { + rangeKeyExprs = range.upperEndpoint().getKeys(); + for (LiteralExpr literalExpr : rangeKeyExprs) { + Object keyValue = literalExpr.getRealValue(); + endKeys.add(keyValue); + } + } + + etlPartitions.add( + new EtlJobConfig.EtlPartition(partitionId, startKeys, endKeys, isMaxPartition, bucketNum)); + } + } else if (type == PartitionType.UNPARTITIONED) { + Preconditions.checkState(partitionIds.size() == 1, "partition size must be eqauls to 1"); + + for (Long partitionId : partitionIds) { + Partition partition = table.getPartition(partitionId); + if (partition == null) { + throw new LoadException("partition does not exist. id: " + partitionId); + } + + // bucket num + int bucketNum = partition.getDistributionInfo().getBucketNum(); + + etlPartitions.add(new EtlJobConfig.EtlPartition(partitionId, Lists.newArrayList(), Lists.newArrayList(), + true, bucketNum)); + } + } else { + throw new LoadException("Spark Load does not support list partition yet"); + } + + // distribution column refs + List distributionColumnRefs = Lists.newArrayList(); + DistributionInfo distributionInfo = table.getDefaultDistributionInfo(); + Preconditions.checkState(distributionInfo.getType() == DistributionInfo.DistributionInfoType.HASH); + for (Column column : ((HashDistributionInfo) distributionInfo).getDistributionColumns()) { + distributionColumnRefs.add(column.getName()); + } + + return new EtlJobConfig.EtlPartitionInfo(type.typeString, partitionColumnRefs, distributionColumnRefs, + etlPartitions); + } + + public void updateEtlStatus() throws Exception { + + if (!checkState(JobState.ETL) || etlStatus == null) { + return; + } + + writeLock(); + try { + switch (etlStatus.getState()) { + case FINISHED: + unprotectedProcessEtlFinish(); + break; + case CANCELLED: + throw new LoadException("spark etl job failed. msg: " + etlStatus.getFailMsg()); + default: + break; + } + } finally { + writeUnlock(); + } + + if (checkState(JobState.LOADING)) { + submitPushTasks(); + } + + } + + private boolean checkState(JobState expectState) { + readLock(); + try { + return state == expectState; + } finally { + readUnlock(); + } + } + + private Set submitPushTasks() throws UserException { + + // check db exist + Database db = null; + try { + db = getDb(); + } catch (MetaNotFoundException e) { + String errMsg = new LogBuilder(LogKey.LOAD_JOB, id).add("database_id", dbId).add("label", label) + .add("error_msg", "db has been deleted when job is loading").build(); + throw new MetaNotFoundException(errMsg); + } + + AgentBatchTask batchTask = new AgentBatchTask(); + boolean hasLoadPartitions = false; + Set totalTablets = Sets.newHashSet(); + List tableList = db.getTablesOnIdOrderOrThrowException( + Lists.newArrayList(tableToLoadPartitions.keySet())); + MetaLockUtils.readLockTables(tableList); + try { + writeLock(); + try { + // check state is still loading. If state is cancelled or finished, return. 
+ // if state is cancelled or finished and not return, + // this would throw all partitions have no load data exception, + // because tableToLoadPartitions was already cleaned up, + if (state != JobState.LOADING) { + LOG.warn("job state is not loading. job id: {}, state: {}", id, state); + return totalTablets; + } + + for (TableIf table : tableList) { + Set partitionIds = tableToLoadPartitions.get(table.getId()); + OlapTable olapTable = (OlapTable) table; + for (long partitionId : partitionIds) { + Partition partition = olapTable.getPartition(partitionId); + if (partition == null) { + throw new LoadException("partition does not exist. id: " + partitionId); + } + + hasLoadPartitions = true; + int quorumReplicaNum = + olapTable.getPartitionInfo().getReplicaAllocation(partitionId).getTotalReplicaNum() / 2 + + 1; + + List indexes = partition.getMaterializedIndices( + MaterializedIndex.IndexExtState.ALL); + for (MaterializedIndex index : indexes) { + long indexId = index.getId(); + MaterializedIndexMeta indexMeta = olapTable.getIndexMetaByIndexId(indexId); + int schemaVersion = indexMeta.getSchemaVersion(); + int schemaHash = indexMeta.getSchemaHash(); + + // check schemaHash and schemaVersion whether is changed + checkIndexSchema(indexId, schemaHash, schemaVersion); + + int bucket = 0; + for (Tablet tablet : index.getTablets()) { + long tabletId = tablet.getId(); + totalTablets.add(tabletId); + Set tabletAllReplicas = Sets.newHashSet(); + Set tabletFinishedReplicas = Sets.newHashSet(); + for (Replica replica : tablet.getReplicas()) { + long replicaId = replica.getId(); + tabletAllReplicas.add(replicaId); + if (!tabletToSentReplicaPushTask.containsKey(tabletId) + || !tabletToSentReplicaPushTask.get(tabletId).containsKey(replicaId)) { + long backendId = replica.getBackendId(); + long taskSignature = Env.getCurrentGlobalTransactionMgr() + .getNextTransactionId(); + + PushTask pushTask = + buildPushTask(backendId, olapTable, taskSignature, partitionId, indexId, + tabletId, replicaId, schemaHash, schemaVersion, bucket++); + if (AgentTaskQueue.addTask(pushTask)) { + batchTask.addTask(pushTask); + if (!tabletToSentReplicaPushTask.containsKey(tabletId)) { + tabletToSentReplicaPushTask.put(tabletId, Maps.newHashMap()); + } + tabletToSentReplicaPushTask.get(tabletId).put(replicaId, pushTask); + } + } + + if (finishedReplicas.contains(replicaId) && replica.getLastFailedVersion() < 0) { + tabletFinishedReplicas.add(replicaId); + } + } + + if (tabletAllReplicas.isEmpty()) { + LOG.error("invalid situation. tablet is empty. 
id: {}", tabletId); + } + + // check tablet push states + if (tabletFinishedReplicas.size() >= quorumReplicaNum) { + quorumTablets.add(tabletId); + if (tabletFinishedReplicas.size() == tabletAllReplicas.size()) { + fullTablets.add(tabletId); + } + } + } + } + } + } + + if (batchTask.getTaskNum() > 0) { + AgentTaskExecutor.submit(batchTask); + } + + if (!hasLoadPartitions) { + String errMsg = new LogBuilder(LogKey.LOAD_JOB, id).add("database_id", dbId).add("label", label) + .add("error_msg", "all partitions have no load data").build(); + throw new LoadException(errMsg); + } + + return totalTablets; + } finally { + writeUnlock(); + } + } finally { + MetaLockUtils.readUnlockTables(tableList); + } + + } + + public void updateJobStatus(Map statusInfo) { + + updateState(statusInfo.get("status"), statusInfo.get("msg")); + + etlStatus.setTrackingUrl(statusInfo.get("appId")); + etlStatus.setProgress(progress); + + if (etlStatus.getState() == TEtlState.FINISHED) { + Gson gson = new Gson(); + DppResult dppResult = gson.fromJson(statusInfo.get("dppResult"), DppResult.class); + loadStatistic.fileNum = (int) dppResult.fileNumber; + loadStatistic.totalFileSizeB = dppResult.fileSize; + TUniqueId dummyId = new TUniqueId(0, 0); + long dummyBackendId = -1L; + loadStatistic.initLoad(dummyId, Sets.newHashSet(dummyId), Lists.newArrayList(dummyBackendId)); + loadStatistic.updateLoadProgress(dummyBackendId, dummyId, dummyId, dppResult.scannedRows, + dppResult.scannedBytes, true); + loadingStatus.setDppResult(dppResult); + Map counters = loadingStatus.getCounters(); + counters.put(DPP_NORMAL_ALL, String.valueOf(dppResult.normalRows)); + counters.put(DPP_ABNORMAL_ALL, String.valueOf(dppResult.abnormalRows)); + counters.put(UNSELECTED_ROWS, String.valueOf(dppResult.unselectRows)); + filePathToSize.putAll( + gson.fromJson(statusInfo.get("filePathToSize"), new TypeToken>() { + })); + hadoopProperties.putAll( + gson.fromJson(statusInfo.get("hadoopProperties"), new TypeToken>() { + })); + } + + } + + private void updateState(String stateStr, String msg) { + + switch (stateStr.toLowerCase()) { + case "running": + etlStatus.setState(TEtlState.RUNNING); + break; + case "success": + etlStatus.setState(TEtlState.FINISHED); + break; + case "failed": + boolean res = etlStatus.setState(TEtlState.CANCELLED); + if (!res) { + etlStatus = new EtlStatus(); + etlStatus.setState(TEtlState.CANCELLED); + } + etlStatus.setFailMsg(msg); + break; + default: + etlStatus.setState(TEtlState.UNKNOWN); + break; + } + + } + + public void startEtlJob() { + etlStartTimestamp = System.currentTimeMillis(); + state = JobState.ETL; + etlStatus = new EtlStatus(); + unprotectedLogUpdateStateInfo(); + } + + private void unprotectedUpdateToLoadingState(EtlStatus etlStatus, Map filePathToSize) + throws LoadException { + try { + for (Map.Entry entry : filePathToSize.entrySet()) { + String filePath = entry.getKey(); + if (!filePath.endsWith(EtlJobConfig.ETL_OUTPUT_FILE_FORMAT)) { + continue; + } + String tabletMetaStr = EtlJobConfig.getTabletMetaStr(filePath); + tabletMetaToFileInfo.put(tabletMetaStr, Pair.of(filePath, entry.getValue())); + } + + loadingStatus = etlStatus; + progress = 0; + Env.getCurrentProgressManager().registerProgressSimple(String.valueOf(id)); + unprotectedUpdateState(JobState.LOADING); + LOG.info("update to {} state success. job id: {}", state, id); + } catch (Exception e) { + LOG.warn("update to {} state failed. 
job id: {}", state, id, e); + throw new LoadException(e.getMessage(), e); + } + } + + private void unprotectedPrepareLoadingInfos() { + for (String tabletMetaStr : tabletMetaToFileInfo.keySet()) { + String[] fileNameArr = tabletMetaStr.split("\\."); + // tableId.partitionId.indexId.bucket.schemaHash + Preconditions.checkState(fileNameArr.length == 5); + long tableId = Long.parseLong(fileNameArr[0]); + long partitionId = Long.parseLong(fileNameArr[1]); + long indexId = Long.parseLong(fileNameArr[2]); + int schemaHash = Integer.parseInt(fileNameArr[4]); + + if (!tableToLoadPartitions.containsKey(tableId)) { + tableToLoadPartitions.put(tableId, Sets.newHashSet()); + } + tableToLoadPartitions.get(tableId).add(partitionId); + + indexToSchemaHash.put(indexId, schemaHash); + } + } + + private void unprotectedProcessEtlFinish() throws Exception { + // checkDataQuality + if (!checkDataQuality()) { + throw new DataQualityException(DataQualityException.QUALITY_FAIL_MSG); + } + + // get etl output files and update loading state + unprotectedUpdateToLoadingState(etlStatus, filePathToSize); + // log loading state + unprotectedLogUpdateStateInfo(); + // prepare loading infos + unprotectedPrepareLoadingInfos(); + } + + private TBrokerScanRange getTBrokerScanRange(DescriptorTable descTable, TupleDescriptor destTupleDesc, + List columns, Map properties) + throws AnalysisException { + + TBrokerScanRange brokerScanRange = new TBrokerScanRange(); + + TBrokerScanRangeParams params = new TBrokerScanRangeParams(); + params.setStrictMode(false); + params.setProperties(properties); + TupleDescriptor srcTupleDesc = descTable.createTupleDescriptor(); + Map srcSlotDescByName = Maps.newHashMap(); + for (Column column : columns) { + SlotDescriptor srcSlotDesc = descTable.addSlotDescriptor(srcTupleDesc); + srcSlotDesc.setIsMaterialized(true); + srcSlotDesc.setIsNullable(true); + + if (column.getDataType() == PrimitiveType.BITMAP) { + // cast to bitmap when the target column type is bitmap + srcSlotDesc.setType(ScalarType.createType(PrimitiveType.BITMAP)); + srcSlotDesc.setColumn(new Column(column.getName(), PrimitiveType.BITMAP)); + } else { + srcSlotDesc.setType(ScalarType.createType(PrimitiveType.VARCHAR)); + srcSlotDesc.setColumn(new Column(column.getName(), PrimitiveType.VARCHAR)); + } + + params.addToSrcSlotIds(srcSlotDesc.getId().asInt()); + srcSlotDescByName.put(column.getName(), srcSlotDesc); + } + + Map destSidToSrcSidWithoutTrans = Maps.newHashMap(); + for (SlotDescriptor destSlotDesc : destTupleDesc.getSlots()) { + if (!destSlotDesc.isMaterialized()) { + continue; + } + + SlotDescriptor srcSlotDesc = srcSlotDescByName.get(destSlotDesc.getColumn().getName()); + destSidToSrcSidWithoutTrans.put(destSlotDesc.getId().asInt(), srcSlotDesc.getId().asInt()); + Expr expr = new SlotRef(srcSlotDesc); + expr = castToSlot(destSlotDesc, expr); + params.putToExprOfDestSlot(destSlotDesc.getId().asInt(), expr.treeToThrift()); + } + params.setDestSidToSrcSidWithoutTrans(destSidToSrcSidWithoutTrans); + params.setSrcTupleId(srcTupleDesc.getId().asInt()); + params.setDestTupleId(destTupleDesc.getId().asInt()); + brokerScanRange.setParams(params); + + // broker address updated for each replica + brokerScanRange.setBrokerAddresses(Lists.newArrayList()); + + // broker range desc + TBrokerRangeDesc tBrokerRangeDesc = new TBrokerRangeDesc(); + tBrokerRangeDesc.setFileType(TFileType.FILE_HDFS); + tBrokerRangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET); + tBrokerRangeDesc.setSplittable(false); + tBrokerRangeDesc.setStartOffset(0); 
+ tBrokerRangeDesc.setSize(-1); + // path and file size updated for each replica + brokerScanRange.setRanges(Collections.singletonList(tBrokerRangeDesc)); + + return brokerScanRange; + + } + + private Expr castToSlot(SlotDescriptor slotDesc, Expr expr) throws AnalysisException { + PrimitiveType dstType = slotDesc.getType().getPrimitiveType(); + PrimitiveType srcType = expr.getType().getPrimitiveType(); + if (dstType == PrimitiveType.BOOLEAN && srcType == PrimitiveType.VARCHAR) { + // there is no cast VARCHAR to BOOLEAN function, + // so we cast VARCHAR to TINYINT first, then cast TINYINT to BOOLEAN + return new CastExpr(Type.BOOLEAN, new CastExpr(Type.TINYINT, expr)); + } + if (dstType != srcType) { + return expr.castTo(slotDesc.getType()); + } + return expr; + } + + private TDescriptorTable getTDescriptorTable(DescriptorTable descTable) { + descTable.computeStatAndMemLayout(); + return descTable.toThrift(); + } + + private PushTask buildPushTask(long backendId, OlapTable olapTable, long taskSignature, long partitionId, + long indexId, long tabletId, long replicaId, int schemaHash, int schemaVersion, + long bucket) + throws AnalysisException { + + DescriptorTable descTable = new DescriptorTable(); + TupleDescriptor destTupleDesc = descTable.createTupleDescriptor(); + + List columnsDesc = new ArrayList<>(); + List columns = new ArrayList<>(); + for (Column column : olapTable.getSchemaByIndexId(indexId)) { + Column col = new Column(column); + col.setName(column.getName().toLowerCase(Locale.ROOT)); + columns.add(col); + columnsDesc.add(col.toThrift()); + // use index schema to fill the descriptor table + SlotDescriptor destSlotDesc = descTable.addSlotDescriptor(destTupleDesc); + destSlotDesc.setIsMaterialized(true); + destSlotDesc.setColumn(col); + destSlotDesc.setIsNullable(col.isAllowNull()); + } + + // deep copy TBrokerScanRange because filePath and fileSize will be updated + // in different tablet push task + TBrokerScanRange tBrokerScanRange = + getTBrokerScanRange(descTable, destTupleDesc, columns, hadoopProperties); + // update filePath fileSize + TBrokerRangeDesc tBrokerRangeDesc = tBrokerScanRange.getRanges().get(0); + tBrokerRangeDesc.setFileType(TFileType.FILE_HDFS); + tBrokerRangeDesc.setPath(""); + tBrokerRangeDesc.setFileSize(-1); + String tabletMetaStr = String.format("%d.%d.%d.%d.%d", olapTable.getId(), partitionId, + indexId, bucket, schemaHash); + if (tabletMetaToFileInfo.containsKey(tabletMetaStr)) { + Pair fileInfo = tabletMetaToFileInfo.get(tabletMetaStr); + tBrokerRangeDesc.setPath(fileInfo.first); + tBrokerRangeDesc.setFileSize(fileInfo.second); + } + + TDescriptorTable tDescriptorTable = getTDescriptorTable(descTable); + + return new PushTask(backendId, dbId, olapTable.getId(), + partitionId, indexId, tabletId, replicaId, schemaHash, 0, id, + TPushType.LOAD_V2, TPriority.NORMAL, transactionId, taskSignature, + tBrokerScanRange, tDescriptorTable, columnsDesc, + olapTable.getStorageVaultId(), schemaVersion); + } + + public void updateLoadingStatus() throws UserException { + if (!checkState(JobState.LOADING)) { + return; + } + + if (etlStatus.getState() == TEtlState.CANCELLED) { + throw new LoadException(etlStatus.getFailMsg()); + } + + // submit push tasks + Set totalTablets = submitPushTasks(); + if (totalTablets.isEmpty()) { + LOG.warn("total tablets set is empty. 
job id: {}, state: {}", id, state); + return; + } + + // update status + boolean canCommitJob = false; + writeLock(); + try { + // loading progress + // 100: txn status is visible and load has been finished + progress = fullTablets.size() * 100 / totalTablets.size(); + if (progress == 100) { + progress = 99; + } + + // quorum finish ts + if (quorumFinishTimestamp < 0 && quorumTablets.containsAll(totalTablets)) { + quorumFinishTimestamp = System.currentTimeMillis(); + } + + // if all replicas are finished or stay in quorum finished for long time, try to commit it. + long stragglerTimeout = 300 * 1000; + if ((quorumFinishTimestamp > 0 && System.currentTimeMillis() - quorumFinishTimestamp > stragglerTimeout) + || fullTablets.containsAll(totalTablets)) { + canCommitJob = true; + } + } finally { + writeUnlock(); + } + + // try commit transaction + if (canCommitJob) { + tryCommitJob(); + } + } + + private void tryCommitJob() throws UserException { + LOG.info(new LogBuilder(LogKey.LOAD_JOB, id).add("txn_id", transactionId) + .add("msg", "Load job try to commit txn").build()); + Database db = getDb(); + List
tableList = db.getTablesOnIdOrderOrThrowException( + Lists.newArrayList(tableToLoadPartitions.keySet())); + MetaLockUtils.writeLockTablesOrMetaException(tableList); + try { + Env.getCurrentGlobalTransactionMgr().commitTransaction( + dbId, tableList, transactionId, commitInfos, + new LoadJobFinalOperation(id, loadingStatus, progress, loadStartTimestamp, + finishTimestamp, state, failMsg)); + } catch (TabletQuorumFailedException e) { + // retry in next loop + } finally { + MetaLockUtils.writeUnlockTables(tableList); + } + } + + public void addFinishedReplica(long replicaId, long tabletId, long backendId) { + writeLock(); + try { + if (finishedReplicas.add(replicaId)) { + commitInfos.add(new TabletCommitInfo(tabletId, backendId)); + // set replica push task null + Map sentReplicaPushTask = tabletToSentReplicaPushTask.get(tabletId); + if (sentReplicaPushTask != null) { + if (sentReplicaPushTask.containsKey(replicaId)) { + sentReplicaPushTask.put(replicaId, null); + } + } + } + } finally { + writeUnlock(); + } + } + + private void clearJob() { + Preconditions.checkState(state == JobState.FINISHED || state == JobState.CANCELLED); + + if (LOG.isDebugEnabled()) { + LOG.debug("clear push tasks and infos that not persist. id: {}, state: {}", id, state); + } + writeLock(); + try { + // clear push task first + for (Map sentReplicaPushTask : tabletToSentReplicaPushTask.values()) { + for (PushTask pushTask : sentReplicaPushTask.values()) { + if (pushTask == null) { + continue; + } + AgentTaskQueue.removeTask(pushTask.getBackendId(), pushTask.getTaskType(), pushTask.getSignature()); + } + } + tableToLoadPartitions.clear(); + indexToSchemaHash.clear(); + tabletToSentReplicaPushTask.clear(); + finishedReplicas.clear(); + quorumTablets.clear(); + fullTablets.clear(); + + Env.getCurrentProgressManager().removeProgress(String.valueOf(progress)); + } finally { + writeUnlock(); + } + } + + private void unprotectedLogUpdateStateInfo() { + IngestionLoadJobStateUpdateInfo info = + new IngestionLoadJobStateUpdateInfo(id, state, transactionId, etlStartTimestamp, loadStartTimestamp, + etlStatus, tabletMetaToFileInfo, hadoopProperties, indexToSchemaVersion); + Env.getCurrentEnv().getEditLog().logUpdateLoadJob(info); + } + + public static class IngestionLoadJobStateUpdateInfo extends LoadJobStateUpdateInfo { + + @SerializedName(value = "etlStartTimestamp") + private long etlStartTimestamp; + @SerializedName(value = "etlStatus") + private EtlStatus etlStatus; + @SerializedName(value = "tabletMetaToFileInfo") + private Map> tabletMetaToFileInfo; + @SerializedName(value = "hadoopProperties") + private Map hadoopProperties; + @SerializedName(value = "indexToSchemaVersion") + private Map indexToSchemaVersion; + + public IngestionLoadJobStateUpdateInfo(long jobId, JobState state, long transactionId, + long etlStartTimestamp, long loadStartTimestamp, EtlStatus etlStatus, + Map> tabletMetaToFileInfo, + Map hadoopProperties, + Map indexToSchemaVersion) { + super(jobId, state, transactionId, loadStartTimestamp); + this.etlStartTimestamp = etlStartTimestamp; + this.etlStatus = etlStatus; + this.tabletMetaToFileInfo = tabletMetaToFileInfo; + this.hadoopProperties = hadoopProperties; + this.indexToSchemaVersion = indexToSchemaVersion; + } + + public long getEtlStartTimestamp() { + return etlStartTimestamp; + } + + public EtlStatus getEtlStatus() { + return etlStatus; + } + + public Map> getTabletMetaToFileInfo() { + return tabletMetaToFileInfo; + } + + public Map getHadoopProperties() { + return hadoopProperties; + } + + public Map 
getIndexToSchemaVersion() { + return indexToSchemaVersion; + } + } + + @Override + public void replayUpdateStateInfo(LoadJobStateUpdateInfo info) { + super.replayUpdateStateInfo(info); + IngestionLoadJobStateUpdateInfo stateUpdateInfo = (IngestionLoadJobStateUpdateInfo) info; + this.etlStartTimestamp = stateUpdateInfo.getEtlStartTimestamp(); + this.etlStatus = stateUpdateInfo.getEtlStatus(); + if (stateUpdateInfo.getTabletMetaToFileInfo() != null) { + this.tabletMetaToFileInfo.putAll(stateUpdateInfo.getTabletMetaToFileInfo()); + } + if (stateUpdateInfo.getHadoopProperties() != null) { + this.hadoopProperties.putAll(stateUpdateInfo.getHadoopProperties()); + } + if (stateUpdateInfo.getIndexToSchemaVersion() != null) { + this.indexToSchemaVersion.putAll(stateUpdateInfo.getIndexToSchemaVersion()); + } + switch (state) { + case ETL: + break; + case LOADING: + unprotectedPrepareLoadingInfos(); + break; + default: + LOG.warn("replay update load job state info failed. error: wrong state. job id: {}, state: {}", id, + state); + break; + } + } + + @Override + protected void readFields(DataInput in) throws IOException { + super.readFields(in); + this.etlStartTimestamp = in.readLong(); + this.etlStatus = new EtlStatus(); + this.etlStatus.readFields(in); + int size = in.readInt(); + for (int i = 0; i < size; i++) { + String tabletMetaStr = Text.readString(in); + Pair fileInfo = Pair.of(Text.readString(in), in.readLong()); + tabletMetaToFileInfo.put(tabletMetaStr, fileInfo); + } + size = in.readInt(); + for (int i = 0; i < size; i++) { + String propKey = Text.readString(in); + String propValue = Text.readString(in); + hadoopProperties.put(propKey, propValue); + } + size = in.readInt(); + for (int i = 0; i < size; i++) { + indexToSchemaVersion.put(in.readLong(), in.readInt()); + } + } + + private void checkIndexSchema(long indexId, int schemaHash, int schemaVersion) throws LoadException { + if (indexToSchemaHash.containsKey(indexId) && indexToSchemaHash.get(indexId) == schemaHash + && indexToSchemaVersion.containsKey(indexId) && indexToSchemaVersion.get(indexId) == schemaVersion) { + return; + } + throw new LoadException( + "schema of index [" + indexId + "] has changed, old schemaHash: " + indexToSchemaHash.get(indexId) + + ", current schemaHash: " + schemaHash + ", old schemaVersion: " + + indexToSchemaVersion.get(indexId) + ", current schemaVersion: " + schemaVersion); + } + +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadJob.java index 8cb9070cf7a503..615196f3e370da 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadJob.java @@ -339,6 +339,9 @@ private void initDefaultJobProperties() { case MINI: timeout = Config.stream_load_default_timeout_second; break; + case INGESTION: + timeout = Config.ingestion_load_default_timeout_second; + break; default: break; } @@ -867,6 +870,8 @@ public static LoadJob read(DataInput in) throws IOException { job = new MiniLoadJob(); } else if (type == EtlJobType.COPY) { job = new CopyJob(); + } else if (type == EtlJobType.INGESTION) { + job = new IngestionLoadJob(); } else { throw new IOException("Unknown load type: " + type.name()); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadManager.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadManager.java index 07c459d61cf056..5992dc445279f0 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadManager.java @@ -32,6 +32,7 @@ import org.apache.doris.common.DataQualityException; import org.apache.doris.common.DdlException; import org.apache.doris.common.LabelAlreadyUsedException; +import org.apache.doris.common.LoadException; import org.apache.doris.common.MetaNotFoundException; import org.apache.doris.common.Pair; import org.apache.doris.common.PatternMatcher; @@ -493,10 +494,16 @@ private void removeLoadJobIf(Predicate pred) { * Only for those jobs which have etl state, like SparkLoadJob. **/ public void processEtlStateJobs() { - idToLoadJob.values().stream().filter(job -> (job.jobType == EtlJobType.SPARK && job.state == JobState.ETL)) + idToLoadJob.values().stream() + .filter(job -> ((job.jobType == EtlJobType.SPARK || job.jobType == EtlJobType.INGESTION) + && job.state == JobState.ETL)) .forEach(job -> { try { - ((SparkLoadJob) job).updateEtlStatus(); + if (job instanceof SparkLoadJob) { + ((SparkLoadJob) job).updateEtlStatus(); + } else if (job instanceof IngestionLoadJob) { + ((IngestionLoadJob) job).updateEtlStatus(); + } } catch (DataQualityException e) { LOG.info("update load job etl status failed. job id: {}", job.getId(), e); job.cancelJobWithoutCheck(new FailMsg(FailMsg.CancelType.ETL_QUALITY_UNSATISFIED, @@ -514,10 +521,16 @@ public void processEtlStateJobs() { * Only for those jobs which load by PushTask. **/ public void processLoadingStateJobs() { - idToLoadJob.values().stream().filter(job -> (job.jobType == EtlJobType.SPARK && job.state == JobState.LOADING)) + idToLoadJob.values().stream() + .filter(job -> ((job.jobType == EtlJobType.SPARK || job.jobType == EtlJobType.INGESTION) + && job.state == JobState.LOADING)) .forEach(job -> { try { - ((SparkLoadJob) job).updateLoadingStatus(); + if (job instanceof SparkLoadJob) { + ((SparkLoadJob) job).updateLoadingStatus(); + } else if (job instanceof IngestionLoadJob) { + ((IngestionLoadJob) job).updateLoadingStatus(); + } } catch (UserException e) { LOG.warn("update load job loading status failed. job id: {}", job.getId(), e); job.cancelJobWithoutCheck(new FailMsg(CancelType.LOAD_RUN_FAIL, e.getMessage()), true, true); @@ -568,8 +581,8 @@ public List> getCreateLoadStmt(long dbId, String label) throw * @param accurateMatch true: filter jobs which's label is labelValue. false: filter jobs which's label like itself. * @param statesValue used to filter jobs which's state within the statesValue set. * @return The result is the list of jobInfo. - * JobInfo is a list which includes the comparable object: jobId, label, state etc. - * The result is unordered. + * JobInfo is a list which includes the comparable object: jobId, label, state etc. + * The result is unordered. 
*/ public List> getLoadJobInfosByDb(long dbId, String labelValue, boolean accurateMatch, Set statesValue) throws AnalysisException { @@ -987,4 +1000,29 @@ public long createLoadJobFromStmt(InsertStmt insertStmt) throws DdlException { loadJobScheduler.submitJob(loadJob); return loadJob.getId(); } + + public long createIngestionLoadJob(String dbName, String label, List tableNames, + Map properties, + UserIdentity userInfo) + throws DdlException, LoadException { + Database db = checkDb(dbName); + long dbId = db.getId(); + LoadJob loadJob; + writeLock(); + try { + checkLabelUsed(dbId, label); + if (unprotectedGetUnfinishedJobNum() >= Config.desired_max_waiting_jobs) { + throw new DdlException("There are more than " + Config.desired_max_waiting_jobs + + " unfinished load jobs, please retry later. You can use `SHOW LOAD` to view submitted jobs"); + } + loadJob = new IngestionLoadJob(dbId, label, tableNames, userInfo); + loadJob.setJobProperties(properties); + createLoadJob(loadJob); + } finally { + writeUnlock(); + } + Env.getCurrentEnv().getEditLog().logCreateLoadJob(loadJob); + return loadJob.getId(); + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkEtlJobHandler.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkEtlJobHandler.java index 69a41bd12836d0..f0533bb80cd121 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkEtlJobHandler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkEtlJobHandler.java @@ -60,6 +60,7 @@ * 4. get spark etl file paths * 5. delete etl output path */ +@Deprecated public class SparkEtlJobHandler { private static final Logger LOG = LogManager.getLogger(SparkEtlJobHandler.class); diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLauncherMonitor.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLauncherMonitor.java index 4b919cd993821c..68d6c571536c9e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLauncherMonitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLauncherMonitor.java @@ -38,6 +38,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +@Deprecated public class SparkLauncherMonitor { private static final Logger LOG = LogManager.getLogger(SparkLauncherMonitor.class); diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLoadAppHandle.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLoadAppHandle.java index a6327ff02a934d..60e82d76557d6a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLoadAppHandle.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLoadAppHandle.java @@ -33,6 +33,7 @@ import java.util.Iterator; import java.util.List; +@Deprecated public class SparkLoadAppHandle implements Writable { private static final Logger LOG = LogManager.getLogger(SparkLoadAppHandle.class); // 5min diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLoadJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLoadJob.java index f01f205e96dc0d..25afaeae22df80 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLoadJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLoadJob.java @@ -118,6 +118,7 @@ * Step3: LoadLoadingChecker will check loading status periodically and commit transaction when push tasks are finished. * Step4: PublishVersionDaemon will send publish version tasks to be and finish transaction. 
*/ +@Deprecated public class SparkLoadJob extends BulkLoadJob { private static final Logger LOG = LogManager.getLogger(SparkLoadJob.class); diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLoadPendingTask.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLoadPendingTask.java index 32749fd8a774f0..797d1c7ed9f2f2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLoadPendingTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkLoadPendingTask.java @@ -31,6 +31,7 @@ import org.apache.doris.catalog.HashDistributionInfo; import org.apache.doris.catalog.HiveTable; import org.apache.doris.catalog.KeysType; +import org.apache.doris.catalog.MaterializedIndexMeta; import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; import org.apache.doris.catalog.PartitionItem; @@ -77,6 +78,7 @@ // 1. create etl job config and write it into jobconfig.json file // 2. submit spark etl job +@Deprecated public class SparkLoadPendingTask extends LoadTask { private static final Logger LOG = LogManager.getLogger(SparkLoadPendingTask.class); @@ -245,7 +247,9 @@ private List createEtlIndexes(OlapTable table) throws LoadException { for (Map.Entry> entry : table.getIndexIdToSchema().entrySet()) { long indexId = entry.getKey(); - int schemaHash = table.getSchemaHashByIndexId(indexId); + MaterializedIndexMeta indexMeta = table.getIndexMetaByIndexId(indexId); + int schemaHash = indexMeta.getSchemaHash(); + int schemaVersion = indexMeta.getSchemaVersion(); boolean changeAggType = table.getKeysTypeByIndexId(indexId).equals(KeysType.UNIQUE_KEYS) && table.getTableProperty().getEnableUniqueKeyMergeOnWrite(); @@ -287,7 +291,7 @@ private List createEtlIndexes(OlapTable table) throws LoadException { // is base index boolean isBaseIndex = indexId == table.getBaseIndexId() ? true : false; - etlIndexes.add(new EtlIndex(indexId, etlColumns, schemaHash, indexType, isBaseIndex)); + etlIndexes.add(new EtlIndex(indexId, etlColumns, schemaHash, indexType, isBaseIndex, schemaVersion)); } return etlIndexes; diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkPendingTaskAttachment.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkPendingTaskAttachment.java index a2bbb058e934c6..315f1ae0cd80a8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkPendingTaskAttachment.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkPendingTaskAttachment.java @@ -17,6 +17,7 @@ package org.apache.doris.load.loadv2; +@Deprecated public class SparkPendingTaskAttachment extends TaskAttachment { private SparkLoadAppHandle handle; private String appId; diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkRepository.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkRepository.java index 19b21ff11fe25b..54279250bf5944 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkRepository.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkRepository.java @@ -54,6 +54,7 @@ * * __archive_3_2_0/ * * ... 
*/ +@Deprecated public class SparkRepository { private static final Logger LOG = LogManager.getLogger(SparkRepository.class); diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkYarnConfigFiles.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkYarnConfigFiles.java index 88038d081b2ccf..7f5894804dcc20 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkYarnConfigFiles.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/SparkYarnConfigFiles.java @@ -42,6 +42,7 @@ import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; +@Deprecated public class SparkYarnConfigFiles { private static final Logger LOG = LogManager.getLogger(SparkYarnConfigFiles.class); diff --git a/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java b/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java index 4010a9b564d0a0..bed46473d0d94d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java +++ b/fe/fe-core/src/main/java/org/apache/doris/master/MasterImpl.java @@ -34,6 +34,7 @@ import org.apache.doris.common.Config; import org.apache.doris.common.MetaNotFoundException; import org.apache.doris.load.DeleteJob; +import org.apache.doris.load.loadv2.IngestionLoadJob; import org.apache.doris.load.loadv2.SparkLoadJob; import org.apache.doris.system.Backend; import org.apache.doris.task.AgentTask; @@ -429,7 +430,11 @@ private void finishRealtimePush(AgentTask task, TFinishTaskRequest request) thro olapTable, partition, backendId, tabletId, tabletMeta.getIndexId()); // if the replica is under schema change, could not find the replica with aim schema hash if (replica != null) { - ((SparkLoadJob) job).addFinishedReplica(replica.getId(), pushTabletId, backendId); + if (job instanceof SparkLoadJob) { + ((SparkLoadJob) job).addFinishedReplica(replica.getId(), pushTabletId, backendId); + } else if (job instanceof IngestionLoadJob) { + ((IngestionLoadJob) job).addFinishedReplica(replica.getId(), pushTabletId, backendId); + } } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/gson/GsonUtils.java b/fe/fe-core/src/main/java/org/apache/doris/persist/gson/GsonUtils.java index ee6f2f74eac43d..6132bbe64eb07d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/persist/gson/GsonUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/gson/GsonUtils.java @@ -182,6 +182,8 @@ import org.apache.doris.job.extensions.mtmv.MTMVJob; import org.apache.doris.load.loadv2.BrokerLoadJob; import org.apache.doris.load.loadv2.BulkLoadJob; +import org.apache.doris.load.loadv2.IngestionLoadJob; +import org.apache.doris.load.loadv2.IngestionLoadJob.IngestionLoadJobStateUpdateInfo; import org.apache.doris.load.loadv2.InsertLoadJob; import org.apache.doris.load.loadv2.LoadJob; import org.apache.doris.load.loadv2.LoadJob.LoadJobStateUpdateInfo; @@ -385,7 +387,9 @@ public class GsonUtils { // runtime adapter for class "LoadJobStateUpdateInfo" private static RuntimeTypeAdapterFactory loadJobStateUpdateInfoTypeAdapterFactory = RuntimeTypeAdapterFactory.of(LoadJobStateUpdateInfo.class, "clazz") - .registerSubtype(SparkLoadJobStateUpdateInfo.class, SparkLoadJobStateUpdateInfo.class.getSimpleName()); + .registerSubtype(SparkLoadJobStateUpdateInfo.class, SparkLoadJobStateUpdateInfo.class.getSimpleName()) + .registerSubtype(IngestionLoadJobStateUpdateInfo.class, + IngestionLoadJobStateUpdateInfo.class.getSimpleName()); // runtime adapter for class "Policy" private static RuntimeTypeAdapterFactory 
policyTypeAdapterFactory = RuntimeTypeAdapterFactory.of( @@ -582,7 +586,8 @@ public class GsonUtils { .registerSubtype(CopyJob.class, CopyJob.class.getSimpleName()) .registerSubtype(InsertLoadJob.class, InsertLoadJob.class.getSimpleName()) .registerSubtype(MiniLoadJob.class, MiniLoadJob.class.getSimpleName()) - .registerSubtype(SparkLoadJob.class, SparkLoadJob.class.getSimpleName()); + .registerSubtype(SparkLoadJob.class, SparkLoadJob.class.getSimpleName()) + .registerSubtype(IngestionLoadJob.class, IngestionLoadJob.class.getSimpleName()); private static RuntimeTypeAdapterFactory partitionItemTypeAdapterFactory = RuntimeTypeAdapterFactory.of(PartitionItem.class, "clazz") diff --git a/fe/fe-core/src/test/java/org/apache/doris/load/loadv2/SparkLoadPendingTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/load/loadv2/SparkLoadPendingTaskTest.java index f02e29271deb06..c924ca472a6579 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/load/loadv2/SparkLoadPendingTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/load/loadv2/SparkLoadPendingTaskTest.java @@ -29,6 +29,7 @@ import org.apache.doris.catalog.Env; import org.apache.doris.catalog.HashDistributionInfo; import org.apache.doris.catalog.KeysType; +import org.apache.doris.catalog.MaterializedIndexMeta; import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; import org.apache.doris.catalog.PartitionInfo; @@ -50,6 +51,7 @@ import org.apache.doris.sparkdpp.EtlJobConfig.EtlPartition; import org.apache.doris.sparkdpp.EtlJobConfig.EtlPartitionInfo; import org.apache.doris.sparkdpp.EtlJobConfig.EtlTable; +import org.apache.doris.thrift.TStorageType; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -87,6 +89,9 @@ public void testExecuteTask(@Injectable SparkLoadJob sparkLoadJob, // partition and distribution infos long partitionId = 2L; DistributionInfo distributionInfo = new HashDistributionInfo(2, Lists.newArrayList(columns.get(0))); + MaterializedIndexMeta indexMeta = + new MaterializedIndexMeta(indexId, columns, 0, 123, (short) 1, TStorageType.COLUMN, KeysType.DUP_KEYS, + null, null, null); PartitionInfo partitionInfo = new SinglePartitionInfo(); Partition partition = new Partition(partitionId, "p1", null, distributionInfo); List partitions = Lists.newArrayList(partition); @@ -111,8 +116,8 @@ public void testExecuteTask(@Injectable SparkLoadJob sparkLoadJob, result = indexIdToSchema; table.getDefaultDistributionInfo(); result = distributionInfo; - table.getSchemaHashByIndexId(indexId); - result = 123; + table.getIndexMetaByIndexId(indexId); + result = indexMeta; table.getPartitionInfo(); result = partitionInfo; table.getPartition(partitionId); @@ -169,6 +174,12 @@ public void testRangePartitionHashDistribution(@Injectable SparkLoadJob sparkLoa long partition2Id = 5L; int distributionColumnIndex = 1; DistributionInfo distributionInfo = new HashDistributionInfo(3, Lists.newArrayList(columns.get(distributionColumnIndex))); + MaterializedIndexMeta indexMeta1 = + new MaterializedIndexMeta(index1Id, columns, 0, 123, (short) 1, TStorageType.COLUMN, KeysType.DUP_KEYS, + null, null, null); + MaterializedIndexMeta indexMeta2 = + new MaterializedIndexMeta(index2Id, columns, 0, 234, (short) 1, TStorageType.COLUMN, KeysType.DUP_KEYS, + null, null, null); Partition partition1 = new Partition(partition1Id, "p1", null, distributionInfo); Partition partition2 = new Partition(partition2Id, "p2", null, @@ -203,10 +214,10 @@ public void 
testRangePartitionHashDistribution(@Injectable SparkLoadJob sparkLoa result = indexIdToSchema; table.getDefaultDistributionInfo(); result = distributionInfo; - table.getSchemaHashByIndexId(index1Id); - result = 123; - table.getSchemaHashByIndexId(index2Id); - result = 234; + table.getIndexMetaByIndexId(index1Id); + result = indexMeta1; + table.getIndexMetaByIndexId(index2Id); + result = indexMeta2; table.getPartitionInfo(); result = partitionInfo; table.getPartition(partition1Id); @@ -222,7 +233,8 @@ public void testRangePartitionHashDistribution(@Injectable SparkLoadJob sparkLoa } }; - SparkLoadPendingTask task = new SparkLoadPendingTask(sparkLoadJob, aggKeyToFileGroups, resource, brokerDesc, LoadTask.Priority.NORMAL); + SparkLoadPendingTask task = new SparkLoadPendingTask(sparkLoadJob, aggKeyToFileGroups, resource, brokerDesc, + LoadTask.Priority.NORMAL); EtlJobConfig etlJobConfig = Deencapsulation.getField(task, "etlJobConfig"); Assert.assertEquals(null, etlJobConfig); task.init(); diff --git a/fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java b/fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java index 90c95cf04fabb9..7c0cd5a542b5ae 100644 --- a/fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java +++ b/fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/dpp/MinimumCoverageRollupTreeBuilderTest.java @@ -55,24 +55,24 @@ public void testBuild() { baseColumns.add(column3); baseColumns.add(column4); EtlJobConfig.EtlIndex baseIndex = new EtlJobConfig.EtlIndex(10000, - baseColumns, 12345, "DUPLICATE", true); + baseColumns, 12345, "DUPLICATE", true, 1); List roll1Columns = new ArrayList<>(); roll1Columns.add(column1); roll1Columns.add(column2); roll1Columns.add(column4); EtlJobConfig.EtlIndex roll1Index = new EtlJobConfig.EtlIndex(10001, - roll1Columns, 12346, "AGGREGATE", false); + roll1Columns, 12346, "AGGREGATE", false, 1); List roll2Columns = new ArrayList<>(); roll2Columns.add(column1); roll2Columns.add(column4); EtlJobConfig.EtlIndex roll2Index = new EtlJobConfig.EtlIndex(10002, - roll2Columns, 12347, "AGGREGATE", false); + roll2Columns, 12347, "AGGREGATE", false, 1); List roll3Columns = new ArrayList<>(); roll3Columns.add(column3); roll3Columns.add(column4); EtlJobConfig.EtlIndex roll3Index = new EtlJobConfig.EtlIndex(10003, - roll3Columns, 12348, "AGGREGATE", false); + roll3Columns, 12348, "AGGREGATE", false, 1); List indexes = new ArrayList<>(); indexes.add(baseIndex); diff --git a/fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java b/fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java index 0ea7f660923834..bb9de716438d71 100644 --- a/fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java +++ b/fe/spark-dpp/src/test/java/org/apache/doris/load/loadv2/etl/SparkEtlJobTest.java @@ -68,9 +68,9 @@ public void setUp() { EtlColumn k1 = new EtlColumn("k1", "INT", false, true, "NONE", "0", 0, 0, 0); EtlColumn k2 = new EtlColumn("k2", "VARCHAR", false, true, "NONE", "0", 10, 0, 0); EtlColumn v1 = new EtlColumn("v1", "BIGINT", false, false, "NONE", "0", 0, 0, 0); - EtlIndex index1 = new EtlIndex(index1Id, Lists.newArrayList(k1, k2, v1), 666666, "DUPLICATE", true); + EtlIndex index1 = new EtlIndex(index1Id, Lists.newArrayList(k1, k2, v1), 666666, "DUPLICATE", true, 1); v1 = new EtlColumn("v1", "BIGINT", false, false, "SUM", "0", 0, 0, 0); - EtlIndex index2 = new 
EtlIndex(index2Id, Lists.newArrayList(k1, v1), 888888, "AGGREGATE", true); + EtlIndex index2 = new EtlIndex(index2Id, Lists.newArrayList(k1, v1), 888888, "AGGREGATE", true, 1); List indexes = Lists.newArrayList(index1, index2); // partition info List partitions = Lists.newArrayList(); diff --git a/regression-test/data/load_p0/ingestion_load/data.parquet b/regression-test/data/load_p0/ingestion_load/data.parquet new file mode 100644 index 00000000000000..81d0d01460cc5b Binary files /dev/null and b/regression-test/data/load_p0/ingestion_load/data.parquet differ diff --git a/regression-test/data/load_p0/ingestion_load/data1.parquet b/regression-test/data/load_p0/ingestion_load/data1.parquet new file mode 100644 index 00000000000000..623456ace7430d Binary files /dev/null and b/regression-test/data/load_p0/ingestion_load/data1.parquet differ diff --git a/regression-test/data/load_p0/ingestion_load/data2-0.parquet b/regression-test/data/load_p0/ingestion_load/data2-0.parquet new file mode 100644 index 00000000000000..ef0e63887c114c Binary files /dev/null and b/regression-test/data/load_p0/ingestion_load/data2-0.parquet differ diff --git a/regression-test/data/load_p0/ingestion_load/data2-1.parquet b/regression-test/data/load_p0/ingestion_load/data2-1.parquet new file mode 100644 index 00000000000000..a1c388a7bf65c4 Binary files /dev/null and b/regression-test/data/load_p0/ingestion_load/data2-1.parquet differ diff --git a/regression-test/data/load_p0/ingestion_load/data2-2.parquet b/regression-test/data/load_p0/ingestion_load/data2-2.parquet new file mode 100644 index 00000000000000..720ea77afd23d4 Binary files /dev/null and b/regression-test/data/load_p0/ingestion_load/data2-2.parquet differ diff --git a/regression-test/data/load_p0/ingestion_load/data2-3.parquet b/regression-test/data/load_p0/ingestion_load/data2-3.parquet new file mode 100644 index 00000000000000..cc2b37a7c03fea Binary files /dev/null and b/regression-test/data/load_p0/ingestion_load/data2-3.parquet differ diff --git a/regression-test/data/load_p0/ingestion_load/test_ingestion_load.out b/regression-test/data/load_p0/ingestion_load/test_ingestion_load.out new file mode 100644 index 00000000000000..f8ce916c7de5fd --- /dev/null +++ b/regression-test/data/load_p0/ingestion_load/test_ingestion_load.out @@ -0,0 +1,37 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select -- +-2067215761 41V4fgCKI3Hkioe DZGaq24Yqh7SwmbPT6IX23jfC5NKqG7gE9JT4GLwiaQtoO8l6EjhGWQP9X7NHmjdqMbIN5kNeDkffOrlS6roIwj2wXpJ true 123 -4524 1014223769452772206 -6589390612399245616 29103.387 -4.09385322835896E8 886.394 -699.960 2024-02-22 2024-03-04 2024-01-01T22:13:54 2023-10-02T19:35:10 +-1982022740 fcZ1o6ZXG8UOFh5 iw4Ziys42GRRTFNkVPeQEA9I5EQtBD04xfefDsPCWN0vr1 true -3 -25143 7122614823242399351 7391807634038366248 23160.604 -9.20283206353984E8 829.880 -387.403 2023-10-16 2024-05-13 2024-07-16T18:27:45 2023-11-03T05:30:21 +-1228926576 1TISaGtB01BiqVt kq4u false -123 15962 3590007830423934951 -1478759439092857810 -7813.757 -6.98785793100899E8 930.743 402.350 2024-07-23 2023-07-30 2023-11-27T17:48:50 2024-03-11T21:09:58 +-575060895 rRfatUJiOO5dq9Y ETjqrUNUnI5kSmkafjWfRTG8HIp98pLGXagNpXZHqOIZZDRkoGeahOwk9 false 16 -767 6623730208927396375 -3055706367894284822 12540.839 -1.047911096098831E9 -752.454 -241.620 2024-04-10 2024-05-16 2023-12-07T23:38:05 2023-12-11T05:48:36 +-76042627 PcVaKC43qmIzuxY U3aGxaZumFpqcUsLI true 44 31151 9085406701767055602 -5846138572199996843 -16845.29 2.44522690225531E8 -784.720 -467.133 2023-10-31 2023-08-29 2023-09-12T10:12:46 2023-10-19T17:02:51 +121048200 KPLWjhhbGXqflJi rzqOYQH9ySHPwCm5K4GdeuI28G8LLmnpqLmsLMLfyRIvcfrlubQI47wUa8QILhuS38MBkjL true 42 13182 -6601530758880565531 5619594098883737912 -2782.1506 3.86698722676211E8 478.420 -330.289 2024-06-17 2023-12-26 2024-04-28T03:29:04 2023-08-18T21:05:32 +262860291 m3XgmlbIHYNH1qS BTJRzVrpM78zJAsHMEGhkF5BiDoc3yJuoV0s209sFcqElZsheBgolBGlFl9X4EfauD64FcFF2Mi4V0dKZfpDgaLLRPfG1SALV7 false -42 5990 -7504815416577235660 1535659792778122944 1171.9619 1.28834143701229E8 626.721 682.828 2023-11-24 2023-11-18 2024-03-21T11:50:17 2024-03-31T12:59:27 +579428879 KsOC6WGrieGlo7B SzeA6tRbsiGWJTBDvBQdBjCqjSE6Y false -111 32758 4029182463831656671 -3546198025044093789 20338.55 -2.015222388533773E9 61.981 720.310 2023-11-13 2024-07-04 2024-07-19T12:42:28 2024-01-04T10:32:53 +1145092610 xWJUDWAV8Nllo0F dnZ9RMVdoqxh4kGBvy55zQdChNTVYdlvRZP4aWIkXyErUbM1XmFGQ9vuCD113JKKCyx4crDoY false 115 -22832 -7242855305248390982 -4240353246453053617 -9074.909 -2.51212400295869E8 -502.410 618.820 2024-06-12 2024-04-18 2023-11-04T09:55:17 2023-11-13T16:30:23 +1736373707 UU14wnLhPkBid41 pmuNqYfOc3JCscf9meT5dYB2i28Pt9iaeXK4QqjVZJdoKFOeZI5bG9RKm1zInTdDMW1N0PKI5Y true -105 -20276 360048259532857165 -4602633478165721463 -13230.296 -1.708246954394742E9 757.147 -533.800 2024-01-05 2023-09-08 2023-11-27T05:21:33 2024-02-11T21:35:03 + +-- !select -- +-2067215761 41V4fgCKI3Hkioe DZGaq24Yqh7SwmbPT6IX23jfC5NKqG7gE9JT4GLwiaQtoO8l6EjhGWQP9X7NHmjdqMbIN5kNeDkffOrlS6roIwj2wXpJ true 123 -4524 1014223769452772206 -6589390612399245616 29103.387 -4.09385322835896E8 886.394 -699.960 2024-02-22 2024-03-04 2024-01-01T22:13:54 2023-10-02T19:35:10 +-1982022740 fcZ1o6ZXG8UOFh5 iw4Ziys42GRRTFNkVPeQEA9I5EQtBD04xfefDsPCWN0vr1 true -3 -25143 7122614823242399351 7391807634038366248 23160.604 -9.20283206353984E8 829.880 -387.403 2023-10-16 2024-05-13 2024-07-16T18:27:45 2023-11-03T05:30:21 +-1228926576 1TISaGtB01BiqVt kq4u false -123 15962 3590007830423934951 -1478759439092857810 -7813.757 -6.98785793100899E8 930.743 402.350 2024-07-23 2023-07-30 2023-11-27T17:48:50 2024-03-11T21:09:58 +-575060895 rRfatUJiOO5dq9Y ETjqrUNUnI5kSmkafjWfRTG8HIp98pLGXagNpXZHqOIZZDRkoGeahOwk9 false 16 -767 6623730208927396375 -3055706367894284822 12540.839 -1.047911096098831E9 -752.454 -241.620 2024-04-10 2024-05-16 2023-12-07T23:38:05 
2023-12-11T05:48:36 +-76042627 PcVaKC43qmIzuxY U3aGxaZumFpqcUsLI true 44 31151 9085406701767055602 -5846138572199996843 -16845.29 2.44522690225531E8 -784.720 -467.133 2023-10-31 2023-08-29 2023-09-12T10:12:46 2023-10-19T17:02:51 +121048200 KPLWjhhbGXqflJi rzqOYQH9ySHPwCm5K4GdeuI28G8LLmnpqLmsLMLfyRIvcfrlubQI47wUa8QILhuS38MBkjL true 42 13182 -6601530758880565531 5619594098883737912 -2782.1506 3.86698722676211E8 478.420 -330.289 2024-06-17 2023-12-26 2024-04-28T03:29:04 2023-08-18T21:05:32 +262860291 m3XgmlbIHYNH1qS BTJRzVrpM78zJAsHMEGhkF5BiDoc3yJuoV0s209sFcqElZsheBgolBGlFl9X4EfauD64FcFF2Mi4V0dKZfpDgaLLRPfG1SALV7 false -42 5990 -7504815416577235660 1535659792778122944 1171.9619 1.28834143701229E8 626.721 682.828 2023-11-24 2023-11-18 2024-03-21T11:50:17 2024-03-31T12:59:27 +579428879 KsOC6WGrieGlo7B SzeA6tRbsiGWJTBDvBQdBjCqjSE6Y false -111 32758 4029182463831656671 -3546198025044093789 20338.55 -2.015222388533773E9 61.981 720.310 2023-11-13 2024-07-04 2024-07-19T12:42:28 2024-01-04T10:32:53 +1145092610 xWJUDWAV8Nllo0F dnZ9RMVdoqxh4kGBvy55zQdChNTVYdlvRZP4aWIkXyErUbM1XmFGQ9vuCD113JKKCyx4crDoY false 115 -22832 -7242855305248390982 -4240353246453053617 -9074.909 -2.51212400295869E8 -502.410 618.820 2024-06-12 2024-04-18 2023-11-04T09:55:17 2023-11-13T16:30:23 +1736373707 UU14wnLhPkBid41 pmuNqYfOc3JCscf9meT5dYB2i28Pt9iaeXK4QqjVZJdoKFOeZI5bG9RKm1zInTdDMW1N0PKI5Y true -105 -20276 360048259532857165 -4602633478165721463 -13230.296 -1.708246954394742E9 757.147 -533.800 2024-01-05 2023-09-08 2023-11-27T05:21:33 2024-02-11T21:35:03 + +-- !select -- +-9022291871392468311 2023-11-02 mOWPGHmiZ 10784 -128 2023-11-12T18:06:21 4218616419351308798 1993977685 -1857678846 +-6045452612961149194 2024-06-23 G1j 28468 -55 2024-06-09T00:12:11 -6456263257174124469 -727277974 144696403 +-1537906159489906139 2024-04-04 MRMRE18bVh49RD 32763 98 2024-01-20T00:54:03 -1289145371043997006 128181215 -1295829474 +-1510882223779118241 2024-07-24 PCwFn7r21MZr 22960 -79 2024-02-07T18:15:07 -8437284610883885859 472729036 -39626304 +-1185467471318572316 2023-11-08 ieed5Msw8X6be4HGS 16555 -79 2024-07-28T23:08:29 3263664376405334754 -809360772 -1229995615 +-234810200663664160 2024-06-07 s7GIrN805aU3cs2EM -7555 -124 2023-12-28T18:59:15 -3600712745035417587 2035647886 126756427 +4461660295430359180 2024-04-23 K 25428 6 2023-11-15T18:38:20 -4503242152141666001 -1093190312 1511443278 +6742880469957921530 2024-05-02 cJJrvRJfpCuGh 27232 64 2024-08-18T09:46:50 -2607385663861429432 -1390108377 1758263623 +7252685688720766402 2024-03-13 891C2 -9774 -1 2023-10-12T19:45:28 -3210623791036109982 -915986651 -1794344594 +8278077411585505009 2023-11-17 gBesLQnYpjK7iDUUcIi -26656 -50 2023-12-11T14:29:52 -8301529943262026214 -1555756888 -1318983102 + diff --git a/regression-test/data/load_p0/ingestion_load/test_ingestion_load_multi_table.out b/regression-test/data/load_p0/ingestion_load/test_ingestion_load_multi_table.out new file mode 100644 index 00000000000000..7a3ec8e86ddd05 --- /dev/null +++ b/regression-test/data/load_p0/ingestion_load/test_ingestion_load_multi_table.out @@ -0,0 +1,25 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select -- +-2067215761 41V4fgCKI3Hkioe DZGaq24Yqh7SwmbPT6IX23jfC5NKqG7gE9JT4GLwiaQtoO8l6EjhGWQP9X7NHmjdqMbIN5kNeDkffOrlS6roIwj2wXpJ true 123 -4524 1014223769452772206 -6589390612399245616 29103.387 -4.09385322835896E8 886.394 -699.960 2024-02-22 2024-03-04 2024-01-01T22:13:54 2023-10-02T19:35:10 +-1982022740 fcZ1o6ZXG8UOFh5 iw4Ziys42GRRTFNkVPeQEA9I5EQtBD04xfefDsPCWN0vr1 true -3 -25143 7122614823242399351 7391807634038366248 23160.604 -9.20283206353984E8 829.880 -387.403 2023-10-16 2024-05-13 2024-07-16T18:27:45 2023-11-03T05:30:21 +-1228926576 1TISaGtB01BiqVt kq4u false -123 15962 3590007830423934951 -1478759439092857810 -7813.757 -6.98785793100899E8 930.743 402.350 2024-07-23 2023-07-30 2023-11-27T17:48:50 2024-03-11T21:09:58 +-575060895 rRfatUJiOO5dq9Y ETjqrUNUnI5kSmkafjWfRTG8HIp98pLGXagNpXZHqOIZZDRkoGeahOwk9 false 16 -767 6623730208927396375 -3055706367894284822 12540.839 -1.047911096098831E9 -752.454 -241.620 2024-04-10 2024-05-16 2023-12-07T23:38:05 2023-12-11T05:48:36 +-76042627 PcVaKC43qmIzuxY U3aGxaZumFpqcUsLI true 44 31151 9085406701767055602 -5846138572199996843 -16845.29 2.44522690225531E8 -784.720 -467.133 2023-10-31 2023-08-29 2023-09-12T10:12:46 2023-10-19T17:02:51 +121048200 KPLWjhhbGXqflJi rzqOYQH9ySHPwCm5K4GdeuI28G8LLmnpqLmsLMLfyRIvcfrlubQI47wUa8QILhuS38MBkjL true 42 13182 -6601530758880565531 5619594098883737912 -2782.1506 3.86698722676211E8 478.420 -330.289 2024-06-17 2023-12-26 2024-04-28T03:29:04 2023-08-18T21:05:32 +262860291 m3XgmlbIHYNH1qS BTJRzVrpM78zJAsHMEGhkF5BiDoc3yJuoV0s209sFcqElZsheBgolBGlFl9X4EfauD64FcFF2Mi4V0dKZfpDgaLLRPfG1SALV7 false -42 5990 -7504815416577235660 1535659792778122944 1171.9619 1.28834143701229E8 626.721 682.828 2023-11-24 2023-11-18 2024-03-21T11:50:17 2024-03-31T12:59:27 +579428879 KsOC6WGrieGlo7B SzeA6tRbsiGWJTBDvBQdBjCqjSE6Y false -111 32758 4029182463831656671 -3546198025044093789 20338.55 -2.015222388533773E9 61.981 720.310 2023-11-13 2024-07-04 2024-07-19T12:42:28 2024-01-04T10:32:53 +1145092610 xWJUDWAV8Nllo0F dnZ9RMVdoqxh4kGBvy55zQdChNTVYdlvRZP4aWIkXyErUbM1XmFGQ9vuCD113JKKCyx4crDoY false 115 -22832 -7242855305248390982 -4240353246453053617 -9074.909 -2.51212400295869E8 -502.410 618.820 2024-06-12 2024-04-18 2023-11-04T09:55:17 2023-11-13T16:30:23 +1736373707 UU14wnLhPkBid41 pmuNqYfOc3JCscf9meT5dYB2i28Pt9iaeXK4QqjVZJdoKFOeZI5bG9RKm1zInTdDMW1N0PKI5Y true -105 -20276 360048259532857165 -4602633478165721463 -13230.296 -1.708246954394742E9 757.147 -533.800 2024-01-05 2023-09-08 2023-11-27T05:21:33 2024-02-11T21:35:03 + +-- !select -- +-2067215761 41V4fgCKI3Hkioe DZGaq24Yqh7SwmbPT6IX23jfC5NKqG7gE9JT4GLwiaQtoO8l6EjhGWQP9X7NHmjdqMbIN5kNeDkffOrlS6roIwj2wXpJ true 123 -4524 1014223769452772206 -6589390612399245616 29103.387 -4.09385322835896E8 886.394 -699.960 2024-02-22 2024-03-04 2024-01-01T22:13:54 2023-10-02T19:35:10 +-1982022740 fcZ1o6ZXG8UOFh5 iw4Ziys42GRRTFNkVPeQEA9I5EQtBD04xfefDsPCWN0vr1 true -3 -25143 7122614823242399351 7391807634038366248 23160.604 -9.20283206353984E8 829.880 -387.403 2023-10-16 2024-05-13 2024-07-16T18:27:45 2023-11-03T05:30:21 +-1228926576 1TISaGtB01BiqVt kq4u false -123 15962 3590007830423934951 -1478759439092857810 -7813.757 -6.98785793100899E8 930.743 402.350 2024-07-23 2023-07-30 2023-11-27T17:48:50 2024-03-11T21:09:58 +-575060895 rRfatUJiOO5dq9Y ETjqrUNUnI5kSmkafjWfRTG8HIp98pLGXagNpXZHqOIZZDRkoGeahOwk9 false 16 -767 6623730208927396375 -3055706367894284822 12540.839 -1.047911096098831E9 -752.454 -241.620 2024-04-10 2024-05-16 2023-12-07T23:38:05 
2023-12-11T05:48:36 +-76042627 PcVaKC43qmIzuxY U3aGxaZumFpqcUsLI true 44 31151 9085406701767055602 -5846138572199996843 -16845.29 2.44522690225531E8 -784.720 -467.133 2023-10-31 2023-08-29 2023-09-12T10:12:46 2023-10-19T17:02:51 +121048200 KPLWjhhbGXqflJi rzqOYQH9ySHPwCm5K4GdeuI28G8LLmnpqLmsLMLfyRIvcfrlubQI47wUa8QILhuS38MBkjL true 42 13182 -6601530758880565531 5619594098883737912 -2782.1506 3.86698722676211E8 478.420 -330.289 2024-06-17 2023-12-26 2024-04-28T03:29:04 2023-08-18T21:05:32 +262860291 m3XgmlbIHYNH1qS BTJRzVrpM78zJAsHMEGhkF5BiDoc3yJuoV0s209sFcqElZsheBgolBGlFl9X4EfauD64FcFF2Mi4V0dKZfpDgaLLRPfG1SALV7 false -42 5990 -7504815416577235660 1535659792778122944 1171.9619 1.28834143701229E8 626.721 682.828 2023-11-24 2023-11-18 2024-03-21T11:50:17 2024-03-31T12:59:27 +579428879 KsOC6WGrieGlo7B SzeA6tRbsiGWJTBDvBQdBjCqjSE6Y false -111 32758 4029182463831656671 -3546198025044093789 20338.55 -2.015222388533773E9 61.981 720.310 2023-11-13 2024-07-04 2024-07-19T12:42:28 2024-01-04T10:32:53 +1145092610 xWJUDWAV8Nllo0F dnZ9RMVdoqxh4kGBvy55zQdChNTVYdlvRZP4aWIkXyErUbM1XmFGQ9vuCD113JKKCyx4crDoY false 115 -22832 -7242855305248390982 -4240353246453053617 -9074.909 -2.51212400295869E8 -502.410 618.820 2024-06-12 2024-04-18 2023-11-04T09:55:17 2023-11-13T16:30:23 +1736373707 UU14wnLhPkBid41 pmuNqYfOc3JCscf9meT5dYB2i28Pt9iaeXK4QqjVZJdoKFOeZI5bG9RKm1zInTdDMW1N0PKI5Y true -105 -20276 360048259532857165 -4602633478165721463 -13230.296 -1.708246954394742E9 757.147 -533.800 2024-01-05 2023-09-08 2023-11-27T05:21:33 2024-02-11T21:35:03 + diff --git a/regression-test/data/load_p0/ingestion_load/test_ingestion_load_with_inverted_index.out b/regression-test/data/load_p0/ingestion_load/test_ingestion_load_with_inverted_index.out new file mode 100644 index 00000000000000..f39b6b66d29108 --- /dev/null +++ b/regression-test/data/load_p0/ingestion_load/test_ingestion_load_with_inverted_index.out @@ -0,0 +1,13 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !select -- +-2067215761 41V4fgCKI3Hkioe DZGaq24Yqh7SwmbPT6IX23jfC5NKqG7gE9JT4GLwiaQtoO8l6EjhGWQP9X7NHmjdqMbIN5kNeDkffOrlS6roIwj2wXpJ true 123 -4524 1014223769452772206 -6589390612399245616 29103.387 -4.09385322835896E8 886.394 -699.960 2024-02-22 2024-03-04 2024-01-01T22:13:54 2023-10-02T19:35:10 +-1982022740 fcZ1o6ZXG8UOFh5 iw4Ziys42GRRTFNkVPeQEA9I5EQtBD04xfefDsPCWN0vr1 true -3 -25143 7122614823242399351 7391807634038366248 23160.604 -9.20283206353984E8 829.880 -387.403 2023-10-16 2024-05-13 2024-07-16T18:27:45 2023-11-03T05:30:21 +-1228926576 1TISaGtB01BiqVt kq4u false -123 15962 3590007830423934951 -1478759439092857810 -7813.757 -6.98785793100899E8 930.743 402.350 2024-07-23 2023-07-30 2023-11-27T17:48:50 2024-03-11T21:09:58 +-575060895 rRfatUJiOO5dq9Y ETjqrUNUnI5kSmkafjWfRTG8HIp98pLGXagNpXZHqOIZZDRkoGeahOwk9 false 16 -767 6623730208927396375 -3055706367894284822 12540.839 -1.047911096098831E9 -752.454 -241.620 2024-04-10 2024-05-16 2023-12-07T23:38:05 2023-12-11T05:48:36 +-76042627 PcVaKC43qmIzuxY U3aGxaZumFpqcUsLI true 44 31151 9085406701767055602 -5846138572199996843 -16845.29 2.44522690225531E8 -784.720 -467.133 2023-10-31 2023-08-29 2023-09-12T10:12:46 2023-10-19T17:02:51 +121048200 KPLWjhhbGXqflJi rzqOYQH9ySHPwCm5K4GdeuI28G8LLmnpqLmsLMLfyRIvcfrlubQI47wUa8QILhuS38MBkjL true 42 13182 -6601530758880565531 5619594098883737912 -2782.1506 3.86698722676211E8 478.420 -330.289 2024-06-17 2023-12-26 2024-04-28T03:29:04 2023-08-18T21:05:32 +262860291 m3XgmlbIHYNH1qS BTJRzVrpM78zJAsHMEGhkF5BiDoc3yJuoV0s209sFcqElZsheBgolBGlFl9X4EfauD64FcFF2Mi4V0dKZfpDgaLLRPfG1SALV7 false -42 5990 -7504815416577235660 1535659792778122944 1171.9619 1.28834143701229E8 626.721 682.828 2023-11-24 2023-11-18 2024-03-21T11:50:17 2024-03-31T12:59:27 +579428879 KsOC6WGrieGlo7B SzeA6tRbsiGWJTBDvBQdBjCqjSE6Y false -111 32758 4029182463831656671 -3546198025044093789 20338.55 -2.015222388533773E9 61.981 720.310 2023-11-13 2024-07-04 2024-07-19T12:42:28 2024-01-04T10:32:53 +1145092610 xWJUDWAV8Nllo0F dnZ9RMVdoqxh4kGBvy55zQdChNTVYdlvRZP4aWIkXyErUbM1XmFGQ9vuCD113JKKCyx4crDoY false 115 -22832 -7242855305248390982 -4240353246453053617 -9074.909 -2.51212400295869E8 -502.410 618.820 2024-06-12 2024-04-18 2023-11-04T09:55:17 2023-11-13T16:30:23 +1736373707 UU14wnLhPkBid41 pmuNqYfOc3JCscf9meT5dYB2i28Pt9iaeXK4QqjVZJdoKFOeZI5bG9RKm1zInTdDMW1N0PKI5Y true -105 -20276 360048259532857165 -4602633478165721463 -13230.296 -1.708246954394742E9 757.147 -533.800 2024-01-05 2023-09-08 2023-11-27T05:21:33 2024-02-11T21:35:03 + diff --git a/regression-test/data/load_p0/ingestion_load/test_ingestion_load_with_partition.out b/regression-test/data/load_p0/ingestion_load/test_ingestion_load_with_partition.out new file mode 100644 index 00000000000000..37d0553e58c3c5 --- /dev/null +++ b/regression-test/data/load_p0/ingestion_load/test_ingestion_load_with_partition.out @@ -0,0 +1,7 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !select -- +2024-09-01 5 +2024-09-02 1 +2024-09-03 1 +2024-09-04 3 + diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load.groovy new file mode 100644 index 00000000000000..a6e213bba89f2a --- /dev/null +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load.groovy @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import java.nio.file.Files +import java.nio.file.Paths +import java.nio.file.StandardCopyOption + +suite('test_ingestion_load', 'p0') { + + def testIngestLoadJob = { testTable, loadLabel, String dataFile -> + + sql "TRUNCATE TABLE ${testTable}" + + sql "CLEAN LABEL FROM ${context.dbName}" + + Integer loadId = -1 + Integer tableId = -1 + Integer partitionId = -1 + Integer indexId = -1 + Integer bucketId = 0 + Integer schemaHash = -1 + + String reqBody = + """{ + "label": "${loadLabel}", + "tableToPartition": { + "${testTable}": [] + }, + "properties": {} + }""" + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_create" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body reqBody + check { code, resBody -> + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + def data = resBodyJson.data + loadId = data.loadId + def tableMeta = data.tableMeta + tableId = tableMeta["${testTable}"].id + def index = tableMeta["${testTable}"].indexes[0] + indexId = index.indexId + schemaHash = index.schemaHash + partitionId = tableMeta["${testTable}"].partitionInfo.partitions[0].partitionId + } + } + + String resultFileName = "V1.${loadLabel}.${tableId}.${partitionId}.${indexId}.${bucketId}.${schemaHash}.parquet" + logger.info("resultFileName: " + resultFileName) + + Files.copy(Paths.get(dataFile), + Paths.get(context.config.dataPath + "/load_p0/ingestion_load/${resultFileName}"), StandardCopyOption.REPLACE_EXISTING) + + String etlResultFilePath = uploadToHdfs "/load_p0/ingestion_load/${resultFileName}" + + String dppResult = '{\\"isSuccess\\":true,\\"failedReason\\":\\"\\",\\"scannedRows\\":10,\\"fileNumber\\":1,' + + '\\"fileSize\\":2441,\\"normalRows\\":10,\\"abnormalRows\\":0,\\"unselectRows\\":0,' + + '\\"partialAbnormalRows\\":\\"[]\\",\\"scannedBytes\\":0}' + + String updateStatusReqBody = + """{ + "loadId": ${loadId}, + "statusInfo": { + "status": "SUCCESS", + "msg": "", + "appId": "", + "dppResult": "${dppResult}", + "filePathToSize": "{\\"${etlResultFilePath}\\": 81758}", + "hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}" + } + }""" + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_update" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body updateStatusReqBody + check { code, resBody -> + { + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + } + } + } + + max_try_milli_secs = 120000 + while (max_try_milli_secs) { + result = sql "show load where label = 
'${loadLabel}'" + if (result[0][2] == "FINISHED") { + sql "sync" + qt_select "select * from ${testTable} order by 1" + break + } else { + sleep(5000) // wait 1 second every time + max_try_milli_secs -= 5000 + if (max_try_milli_secs <= 0) { + assertEquals(1, 2) + } + } + } + + } + + if (enableHdfs()) { + + tableName = 'tbl_test_spark_load' + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} ( + c_int int(11) NULL, + c_char char(15) NULL, + c_varchar varchar(100) NULL, + c_bool boolean NULL, + c_tinyint tinyint(4) NULL, + c_smallint smallint(6) NULL, + c_bigint bigint(20) NULL, + c_largeint largeint(40) NULL, + c_float float NULL, + c_double double NULL, + c_decimal decimal(6, 3) NULL, + c_decimalv3 decimal(6, 3) NULL, + c_date date NULL, + c_datev2 date NULL, + c_datetime datetime NULL, + c_datetimev2 datetime NULL + ) + DUPLICATE KEY(c_int) + DISTRIBUTED BY HASH(c_int) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + def label = "test_ingestion_load" + + testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet') + + tableName = 'tbl_test_spark_load_unique_mor' + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} ( + c_int int(11) NULL, + c_char char(15) NULL, + c_varchar varchar(100) NULL, + c_bool boolean NULL, + c_tinyint tinyint(4) NULL, + c_smallint smallint(6) NULL, + c_bigint bigint(20) NULL, + c_largeint largeint(40) NULL, + c_float float NULL, + c_double double NULL, + c_decimal decimal(6, 3) NULL, + c_decimalv3 decimal(6, 3) NULL, + c_date date NULL, + c_datev2 date NULL, + c_datetime datetime NULL, + c_datetimev2 datetime NULL + ) + UNIQUE KEY(c_int) + DISTRIBUTED BY HASH(c_int) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "false" + ) + """ + + label = "test_ingestion_load_unique_mor" + + testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet') + + tableName = 'tbl_test_spark_load_agg' + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} + ( + `user_id` LARGEINT NOT NULL COMMENT "user id", + `date` DATE NOT NULL COMMENT "data import time", + `city` VARCHAR(20) COMMENT "city", + `age` SMALLINT COMMENT "age", + `sex` TINYINT COMMENT "gender", + `last_visit_date` DATETIME REPLACE DEFAULT "1970-01-01 00:00:00" COMMENT "last visit date time", + `cost` BIGINT SUM DEFAULT "0" COMMENT "user total cost", + `max_dwell_time` INT MAX DEFAULT "0" COMMENT "user max dwell time", + `min_dwell_time` INT MIN DEFAULT "99999" COMMENT "user min dwell time" + ) + AGGREGATE KEY(`user_id`, `date`, `city`, `age`, `sex`) + DISTRIBUTED BY HASH(`user_id`) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + label = "test_ingestion_load_agg" + + testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data1.parquet') + + } + +} diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_column.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_column.groovy new file mode 100644 index 00000000000000..4a56663d6291ed --- /dev/null +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_column.groovy @@ -0,0 +1,208 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import java.nio.file.Files +import java.nio.file.Paths +import java.nio.file.StandardCopyOption + +suite('test_ingestion_load_alter_column', 'p0') { + + def testIngestLoadJob = { testTable, loadLabel, dataFile, alterAction -> + + sql "TRUNCATE TABLE ${testTable}" + + sql "CLEAN LABEL FROM ${context.dbName}" + + Integer loadId = -1 + Integer tableId = -1 + Integer partitionId = -1 + Integer indexId = -1 + Integer bucketId = 0 + Integer schemaHash = -1 + + String reqBody = + """{ + "label": "${loadLabel}", + "tableToPartition": { + "${testTable}": [] + }, + "properties": {} + }""" + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_create" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body reqBody + check { code, resBody -> + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + def data = resBodyJson.data + loadId = data.loadId + def tableMeta = data.tableMeta + tableId = tableMeta["${testTable}"].id + def index = tableMeta["${testTable}"].indexes[0] + indexId = index.indexId + schemaHash = index.schemaHash + partitionId = tableMeta["${testTable}"].partitionInfo.partitions[0].partitionId + } + } + + String resultFileName = "V1.${loadLabel}.${tableId}.${partitionId}.${indexId}.${bucketId}.${schemaHash}.parquet" + logger.info("resultFileName: " + resultFileName) + + Files.copy(Paths.get(dataFile), + Paths.get(context.config.dataPath + "/load_p0/ingestion_load/${resultFileName}"), StandardCopyOption.REPLACE_EXISTING) + + String etlResultFilePath = uploadToHdfs "/load_p0/ingestion_load/${resultFileName}" + + String dppResult = '{\\"isSuccess\\":true,\\"failedReason\\":\\"\\",\\"scannedRows\\":10,\\"fileNumber\\":1,' + + '\\"fileSize\\":2441,\\"normalRows\\":10,\\"abnormalRows\\":0,\\"unselectRows\\":0,' + + '\\"partialAbnormalRows\\":\\"[]\\",\\"scannedBytes\\":0}' + + String updateStatusReqBody = + """{ + "loadId": ${loadId}, + "statusInfo": { + "status": "SUCCESS", + "msg": "", + "appId": "", + "dppResult": "${dppResult}", + "filePathToSize": "{\\"${etlResultFilePath}\\": 81758}", + "hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}" + } + }""" + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_update" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body updateStatusReqBody + check { code, resBody -> + { + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + } + } + } + + alterAction.call() + + max_try_milli_secs = 120000 + while (max_try_milli_secs) { + result = sql "show load where label = '${loadLabel}'" + if (result[0][2] == "CANCELLED") { + msg = 
result[0][7] + logger.info("err msg: " + msg) + assertTrue((result[0][7] =~ /schema of index \[\d+\] has changed/).find()) + break + } else { + sleep(5000) // wait 1 second every time + max_try_milli_secs -= 5000 + if (max_try_milli_secs <= 0) { + assertEquals(1, 2) + } + } + } + + } + + if (enableHdfs()) { + + tableName1 = 'tbl_test_spark_load_alter_column_1' + tableName2 = 'tbl_test_spark_load_alter_column_2' + + try { + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName1} ( + c_int int(11) NULL, + c_char char(15) NULL, + c_varchar varchar(100) NULL, + c_bool boolean NULL, + c_tinyint tinyint(4) NULL, + c_smallint smallint(6) NULL, + c_bigint bigint(20) NULL, + c_largeint largeint(40) NULL, + c_float float NULL, + c_double double NULL, + c_decimal decimal(6, 3) NULL, + c_decimalv3 decimal(6, 3) NULL, + c_date date NULL, + c_datev2 date NULL, + c_datetime datetime NULL, + c_datetimev2 datetime NULL + ) + DUPLICATE KEY(c_int) + DISTRIBUTED BY HASH(c_int) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + label = "test_ingestion_load_alter_column_1" + + testIngestLoadJob.call(tableName1, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet', { + sql "alter table ${tableName1} drop column c_datetimev2" + }) + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName2} ( + c_int int(11) NULL, + c_char char(15) NULL, + c_varchar varchar(100) NULL, + c_bool boolean NULL, + c_tinyint tinyint(4) NULL, + c_smallint smallint(6) NULL, + c_bigint bigint(20) NULL, + c_largeint largeint(40) NULL, + c_float float NULL, + c_double double NULL, + c_decimal decimal(6, 3) NULL, + c_decimalv3 decimal(6, 3) NULL, + c_date date NULL, + c_datev2 date NULL, + c_datetime datetime NULL, + c_datetimev2 datetime NULL + ) + DUPLICATE KEY(c_int) + DISTRIBUTED BY HASH(c_int) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + label = "test_ingestion_load_alter_column_2" + + testIngestLoadJob.call(tableName2, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet', { + sql "alter table ${tableName2} add column c_string string null" + }) + + } finally { + sql "DROP TABLE ${tableName1}" + sql "DROP TABLE ${tableName2}" + } + + } + +} \ No newline at end of file diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_partition.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_partition.groovy new file mode 100644 index 00000000000000..de91935710294b --- /dev/null +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_alter_partition.groovy @@ -0,0 +1,224 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import java.nio.file.Files +import java.nio.file.Paths +import java.nio.file.StandardCopyOption + +suite('test_ingestion_load_alter_partition', 'p0') { + + def testIngestLoadJob = { testTable, loadLabel, dataFiles, alterAction -> + + sql "TRUNCATE TABLE ${testTable}" + + sql "CLEAN LABEL ${loadLabel} FROM ${context.dbName}" + + Integer loadId = -1 + Integer tableId = -1 + Integer partitionId = -1 + Integer indexId = -1 + Integer bucketId = 0 + Integer schemaHash = -1 + + String reqBody = + """{ + "label": "${loadLabel}", + "tableToPartition": { + "${testTable}": [] + }, + "properties": {} + }""" + + resultFileNames = [] + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_create" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body reqBody + check { code, resBody -> + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + def data = resBodyJson.data + loadId = data.loadId + def tableMeta = data.tableMeta + tableId = tableMeta["${testTable}"].id + def index = tableMeta["${testTable}"].indexes[0] + indexId = index.indexId + schemaHash = index.schemaHash + partitions = tableMeta["${testTable}"].partitionInfo.partitions + for(partition in partitions) { + logger.info("partitionId: " + partition.partitionId) + resultFileNames.add("V1.${loadLabel}.${tableId}.${partition.partitionId}.${indexId}.${bucketId}.${schemaHash}.parquet") + } + } + } + + etlResultFilePaths = [] + for(int i=0; i < dataFiles.size(); i++) { + Files.copy(Paths.get(dataFiles[i]), + Paths.get(context.config.dataPath + "/load_p0/ingestion_load/${resultFileNames[i]}"), StandardCopyOption.REPLACE_EXISTING) + String etlResultFilePath = uploadToHdfs "/load_p0/ingestion_load/${resultFileNames[i]}" + logger.info("etlResultFilePath: " + etlResultFilePath) + etlResultFilePaths.add(etlResultFilePath) + } + + String dppResult = '{\\"isSuccess\\":true,\\"failedReason\\":\\"\\",\\"scannedRows\\":10,\\"fileNumber\\":1,' + + '\\"fileSize\\":2441,\\"normalRows\\":10,\\"abnormalRows\\":0,\\"unselectRows\\":0,' + + '\\"partialAbnormalRows\\":\\"[]\\",\\"scannedBytes\\":0}' + + String updateStatusReqBody = + """{ + "loadId": ${loadId}, + "statusInfo": { + "status": "SUCCESS", + "msg": "", + "appId": "", + "dppResult": "${dppResult}", + "filePathToSize": "{\\"${etlResultFilePaths.get(0)}\\":851,\\"${etlResultFilePaths.get(1)}\\":781,\\"${etlResultFilePaths.get(2)}\\":781,\\"${etlResultFilePaths.get(3)}\\":839}", + "hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}" + } + }""" + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_update" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body updateStatusReqBody + check { code, resBody -> + { + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + } + } + } + + alterAction.call() + + max_try_milli_secs = 120000 + while (max_try_milli_secs) { + result = sql "show load where label = '${loadLabel}'" + if (result[0][2] == "FINISHED") { + sql "sync" + qt_select "select c1, count(*) from ${testTable} group by c1 order by c1" + break + } else if (result[0][2] == "CANCELLED") { + msg = result[0][7] + logger.info("err msg: " + msg) + assertTrue((result[0][7] =~ 
/partition does not exist/).find()) + break + } else { + sleep(5000) // wait 1 second every time + max_try_milli_secs -= 5000 + if (max_try_milli_secs <= 0) { + assertEquals(1, 2) + } + } + } + + } + + if (enableHdfs()) { + + tableName1 = 'tbl_test_spark_load_alter_partition_1' + tableName2 = 'tbl_test_spark_load_alter_partition_2' + tableName3 = 'tbl_test_spark_load_alter_partition_3' + + try { + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName1} ( + c0 int not null, + c1 date, + c2 varchar(64) + ) + DUPLICATE KEY(c0) + PARTITION BY RANGE(c1) ( + FROM ("2024-09-01") TO ("2024-09-05") INTERVAL 1 DAY + ) + DISTRIBUTED BY HASH(c0) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + label = "test_ingestion_load_alter_partition_1" + + testIngestLoadJob.call(tableName1, label, [context.config.dataPath + '/load_p0/ingestion_load/data2-0.parquet', context.config.dataPath + '/load_p0/ingestion_load/data2-1.parquet',context.config.dataPath + '/load_p0/ingestion_load/data2-2.parquet',context.config.dataPath + '/load_p0/ingestion_load/data2-3.parquet'], { + sql "alter table ${tableName1} drop partition p_20240901" + }) + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName2} ( + c0 int not null, + c1 date, + c2 varchar(64) + ) + DUPLICATE KEY(c0) + PARTITION BY RANGE(c1) ( + FROM ("2024-09-01") TO ("2024-09-05") INTERVAL 1 DAY + ) + DISTRIBUTED BY HASH(c0) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + label = "test_ingestion_load_alter_partition_2" + + testIngestLoadJob.call(tableName2, label, [context.config.dataPath + '/load_p0/ingestion_load/data2-0.parquet', context.config.dataPath + '/load_p0/ingestion_load/data2-1.parquet',context.config.dataPath + '/load_p0/ingestion_load/data2-2.parquet',context.config.dataPath + '/load_p0/ingestion_load/data2-3.parquet'], { + sql "alter table ${tableName2} add partition p_20240905 VALUES [('2024-09-05'), ('2024-09-06'))" + }) + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName3} ( + c0 int not null, + c1 date, + c2 varchar(64) + ) + DUPLICATE KEY(c0) + PARTITION BY RANGE(c1) ( + FROM ("2024-09-01") TO ("2024-09-05") INTERVAL 1 DAY + ) + DISTRIBUTED BY HASH(c0) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + label = "test_ingestion_load_alter_partition_3" + + testIngestLoadJob.call(tableName3, label, [context.config.dataPath + '/load_p0/ingestion_load/data2-0.parquet', context.config.dataPath + '/load_p0/ingestion_load/data2-1.parquet',context.config.dataPath + '/load_p0/ingestion_load/data2-2.parquet',context.config.dataPath + '/load_p0/ingestion_load/data2-3.parquet'], { + sql "alter table ${tableName3} add temporary partition tp_20240901 VALUES [('2024-09-01'), ('2024-09-02'))" + sql "alter table ${tableName3} replace partition(p_20240901) with temporary partition(tp_20240901)" + }) + + } finally { + sql "DROP TABLE ${tableName1}" + sql "DROP TABLE ${tableName2}" + sql "DROP TABLE ${tableName3}" + } + + } + +} \ No newline at end of file diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_drop_table.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_drop_table.groovy new file mode 100644 index 00000000000000..4f245c3d535b15 --- /dev/null +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_drop_table.groovy @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import java.nio.file.Files +import java.nio.file.Paths +import java.nio.file.StandardCopyOption + +suite('test_ingestion_load_drop_table', 'p0') { + + def testIngestLoadJob = { testTable, loadLabel, dataFile, alterAction -> + + sql "TRUNCATE TABLE ${testTable}" + + sql "CLEAN LABEL FROM ${context.dbName}" + + Integer loadId = -1 + Integer tableId = -1 + Integer partitionId = -1 + Integer indexId = -1 + Integer bucketId = 0 + Integer schemaHash = -1 + + String reqBody = + """{ + "label": "${loadLabel}", + "tableToPartition": { + "${testTable}": [] + }, + "properties": {} + }""" + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_create" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body reqBody + check { code, resBody -> + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + def data = resBodyJson.data + loadId = data.loadId + def tableMeta = data.tableMeta + tableId = tableMeta["${testTable}"].id + def index = tableMeta["${testTable}"].indexes[0] + indexId = index.indexId + schemaHash = index.schemaHash + partitionId = tableMeta["${testTable}"].partitionInfo.partitions[0].partitionId + } + } + + String resultFileName = "V1.${loadLabel}.${tableId}.${partitionId}.${indexId}.${bucketId}.${schemaHash}.parquet" + logger.info("resultFileName: " + resultFileName) + + Files.copy(Paths.get(dataFile), + Paths.get(context.config.dataPath + "/load_p0/ingestion_load/${resultFileName}"), StandardCopyOption.REPLACE_EXISTING) + + String etlResultFilePath = uploadToHdfs "/load_p0/ingestion_load/${resultFileName}" + + String dppResult = '{\\"isSuccess\\":true,\\"failedReason\\":\\"\\",\\"scannedRows\\":10,\\"fileNumber\\":1,' + + '\\"fileSize\\":2441,\\"normalRows\\":10,\\"abnormalRows\\":0,\\"unselectRows\\":0,' + + '\\"partialAbnormalRows\\":\\"[]\\",\\"scannedBytes\\":0}' + + String updateStatusReqBody = + """{ + "loadId": ${loadId}, + "statusInfo": { + "status": "SUCCESS", + "msg": "", + "appId": "", + "dppResult": "${dppResult}", + "filePathToSize": "{\\"${etlResultFilePath}\\": 81758}", + "hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}" + } + }""" + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_update" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body updateStatusReqBody + check { code, resBody -> + { + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + } + } + } + + alterAction.call() + + max_try_milli_secs = 120000 + while (max_try_milli_secs) { 
+ result = sql "show load where label = '${loadLabel}'" + if (result.size() == 0) { + break + } else { + sleep(5000) // wait 1 second every time + max_try_milli_secs -= 5000 + if (max_try_milli_secs <= 0) { + assertEquals(1, 2) + } + } + } + + } + + if (enableHdfs()) { + + tableName = 'tbl_test_spark_load_drop_table' + + try { + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} ( + c_int int(11) NULL, + c_char char(15) NULL, + c_varchar varchar(100) NULL, + c_bool boolean NULL, + c_tinyint tinyint(4) NULL, + c_smallint smallint(6) NULL, + c_bigint bigint(20) NULL, + c_largeint largeint(40) NULL, + c_float float NULL, + c_double double NULL, + c_decimal decimal(6, 3) NULL, + c_decimalv3 decimal(6, 3) NULL, + c_date date NULL, + c_datev2 date NULL, + c_datetime datetime NULL, + c_datetimev2 datetime NULL + ) + DUPLICATE KEY(c_int) + DISTRIBUTED BY HASH(c_int) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + label = "test_ingestion_load_drop_table" + + testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet', { + sql "DROP TABLE ${tableName}" + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} ( + c_int int(11) NULL, + c_char char(15) NULL, + c_varchar varchar(100) NULL, + c_bool boolean NULL, + c_tinyint tinyint(4) NULL, + c_smallint smallint(6) NULL, + c_bigint bigint(20) NULL, + c_largeint largeint(40) NULL, + c_float float NULL, + c_double double NULL, + c_decimal decimal(6, 3) NULL, + c_decimalv3 decimal(6, 3) NULL, + c_date date NULL, + c_datev2 date NULL, + c_datetime datetime NULL, + c_datetimev2 datetime NULL + ) + DUPLICATE KEY(c_int) + DISTRIBUTED BY HASH(c_int) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + }) + + } finally { + sql "DROP TABLE ${tableName}" + } + + } + +} \ No newline at end of file diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_multi_table.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_multi_table.groovy new file mode 100644 index 00000000000000..67455d8c692cd3 --- /dev/null +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_multi_table.groovy @@ -0,0 +1,208 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
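All of these suites reconstruct the ETL output file name that the FE expects from the metadata returned by the `_create` call. A minimal plain-Groovy sketch of that naming convention follows; the label and all numeric IDs are illustrative placeholders, not values from a real response.

    // Naming convention used by the suites for ETL result files:
    //   V1.<label>.<tableId>.<partitionId>.<indexId>.<bucketId>.<schemaHash>.parquet
    // All IDs below are illustrative placeholders.
    def etlResultFileName = { String label, long tableId, long partitionId,
                              long indexId, int bucketId, int schemaHash ->
        "V1.${label}.${tableId}.${partitionId}.${indexId}.${bucketId}.${schemaHash}.parquet"
    }

    assert etlResultFileName('demo_label', 10001L, 10002L, 10003L, 0, 123456789) ==
            'V1.demo_label.10001.10002.10003.0.123456789.parquet'

The multi-table suite below builds one such name per table, while the partitioned suites build one per partition.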
+ +import java.nio.file.Files +import java.nio.file.Paths +import java.nio.file.StandardCopyOption + +suite('test_ingestion_load_multi_table', 'p0') { + + def testIngestLoadJob = { loadLabel, testTable1, testTable2, dataFile1, dataFile2 -> + + sql "TRUNCATE TABLE ${testTable1}" + sql "TRUNCATE TABLE ${testTable2}" + + sql "CLEAN LABEL FROM ${context.dbName}" + + Integer loadId = -1 + Integer tableId = -1 + Integer partitionId = -1 + Integer indexId = -1 + Integer bucketId = 0 + Integer schemaHash = -1 + + String resultFileName1 = "" + String resultFileName2 = "" + + String reqBody = + """{ + "label": "${loadLabel}", + "tableToPartition": { + "${testTable1}": [], + "${testTable2}": [] + }, + "properties": {} + }""" + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_create" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body reqBody + check { code, resBody -> + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + def data = resBodyJson.data + loadId = data.loadId + def tableMeta = data.tableMeta + // table1 + tableId = tableMeta["${testTable1}"].id + def index1 = tableMeta["${testTable1}"].indexes[0] + indexId = index1.indexId + schemaHash = index1.schemaHash + partitionId = tableMeta["${testTable1}"].partitionInfo.partitions[0].partitionId + resultFileName1 = "V1.${loadLabel}.${tableId}.${partitionId}.${indexId}.${bucketId}.${schemaHash}.parquet" + // table2 + tableId = tableMeta["${testTable2}"].id + def index2 = tableMeta["${testTable2}"].indexes[0] + indexId = index2.indexId + schemaHash = index2.schemaHash + partitionId = tableMeta["${testTable2}"].partitionInfo.partitions[0].partitionId + resultFileName2 = "V1.${loadLabel}.${tableId}.${partitionId}.${indexId}.${bucketId}.${schemaHash}.parquet" + } + } + + logger.info("resultFileName1: " + resultFileName1) + logger.info("resultFileName2: " + resultFileName2) + + Files.copy(Paths.get(dataFile1), + Paths.get(context.config.dataPath + "/load_p0/ingestion_load/${resultFileName1}"), StandardCopyOption.REPLACE_EXISTING) + Files.copy(Paths.get(dataFile2), + Paths.get(context.config.dataPath + "/load_p0/ingestion_load/${resultFileName2}"), StandardCopyOption.REPLACE_EXISTING) + + String etlResultFilePath1 = uploadToHdfs "/load_p0/ingestion_load/${resultFileName1}" + String etlResultFilePath2 = uploadToHdfs "/load_p0/ingestion_load/${resultFileName2}" + + String dppResult = '{\\"isSuccess\\":true,\\"failedReason\\":\\"\\",\\"scannedRows\\":10,\\"fileNumber\\":2,' + + '\\"fileSize\\":163516,\\"normalRows\\":10,\\"abnormalRows\\":0,\\"unselectRows\\":0,' + + '\\"partialAbnormalRows\\":\\"[]\\",\\"scannedBytes\\":0}' + + + String updateStatusReqBody = + """{ + "loadId": ${loadId}, + "statusInfo": { + "status": "SUCCESS", + "msg": "", + "appId": "", + "dppResult": "${dppResult}", + "filePathToSize": "{\\"${etlResultFilePath1}\\": 81758, \\"${etlResultFilePath2}\\": 81758}", + "hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}" + } + }""" + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_update" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body updateStatusReqBody + check { code, resBody -> + { + assert code == 200 + def resBodyJson = parseJson(resBody) + assert 
resBodyJson instanceof Map + assert resBodyJson.code == 0 + } + } + } + + max_try_milli_secs = 60000 + while (max_try_milli_secs) { + result = sql "show load where label = '${loadLabel}'" + if (result[0][2] == "FINISHED") { + sql "sync" + qt_select "select * from ${testTable1} order by c_int" + qt_select "select * from ${testTable2} order by c_int" + break + } else { + sleep(5000) // wait 1 second every time + max_try_milli_secs -= 5000 + if (max_try_milli_secs <= 0) { + assertEquals(1, 2) + } + } + } + + } + + if (enableHdfs()) { + + tableName1 = 'tbl_test_spark_load_multi_1' + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName1} ( + c_int int(11) NULL, + c_char char(15) NULL, + c_varchar varchar(100) NULL, + c_bool boolean NULL, + c_tinyint tinyint(4) NULL, + c_smallint smallint(6) NULL, + c_bigint bigint(20) NULL, + c_largeint largeint(40) NULL, + c_float float NULL, + c_double double NULL, + c_decimal decimal(6, 3) NULL, + c_decimalv3 decimal(6, 3) NULL, + c_date date NULL, + c_datev2 date NULL, + c_datetime datetime NULL, + c_datetimev2 datetime NULL + ) + DISTRIBUTED BY HASH(c_int) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + tableName2 = 'tbl_test_spark_load_multi_2' + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName2} ( + c_int int(11) NULL, + c_char char(15) NULL, + c_varchar varchar(100) NULL, + c_bool boolean NULL, + c_tinyint tinyint(4) NULL, + c_smallint smallint(6) NULL, + c_bigint bigint(20) NULL, + c_largeint largeint(40) NULL, + c_float float NULL, + c_double double NULL, + c_decimal decimal(6, 3) NULL, + c_decimalv3 decimal(6, 3) NULL, + c_date date NULL, + c_datev2 date NULL, + c_datetime datetime NULL, + c_datetimev2 datetime NULL + ) + DISTRIBUTED BY HASH(c_int) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + def label = "test_ingestion_load_multi_table" + + testIngestLoadJob.call(label, tableName1, tableName2, context.config.dataPath + '/load_p0/ingestion_load/data.parquet', context.config.dataPath + '/load_p0/ingestion_load/data.parquet') + + } + +} \ No newline at end of file diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_inverted_index.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_inverted_index.groovy new file mode 100644 index 00000000000000..7eed4bfdc58342 --- /dev/null +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_inverted_index.groovy @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
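After pushing the status update, each suite polls `show load` until the job reaches a terminal state. A sketch of that pattern factored into one helper; `fetchState` is a hypothetical stand-in for the `sql "show load where label = ..."` call and is not part of the regression framework.

    // Polling sketch: fixed time budget, 5-second sleep between attempts,
    // stop on any terminal state. fetchState is an assumed closure that
    // returns the State column of `show load where label = ...`.
    def waitForTerminalState = { Closure<String> fetchState,
                                 long timeoutMs = 120000L, long intervalMs = 5000L ->
        long remaining = timeoutMs
        while (remaining > 0) {
            def state = fetchState()
            if (state == 'FINISHED' || state == 'CANCELLED') {
                return state
            }
            sleep(intervalMs)       // wait 5 seconds between polls
            remaining -= intervalMs
        }
        return 'TIMEOUT'
    }

    // Usage inside a suite would look roughly like:
    //   def state = waitForTerminalState({ (sql "show load where label = '${loadLabel}'")[0][2] })
    //   assertEquals('FINISHED', state)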
+ +import java.nio.file.Files +import java.nio.file.Paths +import java.nio.file.StandardCopyOption + +suite('test_ingestion_load_with_inverted_index', 'p0') { + + def testIngestLoadJob = { testTable, loadLabel, String dataFile -> + + sql "TRUNCATE TABLE ${testTable}" + + sql "CLEAN LABEL FROM ${context.dbName}" + + Integer loadId = -1 + Integer tableId = -1 + Integer partitionId = -1 + Integer indexId = -1 + Integer bucketId = 0 + Integer schemaHash = -1 + + String reqBody = + """{ + "label": "${loadLabel}", + "tableToPartition": { + "${testTable}": [] + }, + "properties": {} + }""" + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_create" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body reqBody + check { code, resBody -> + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + def data = resBodyJson.data + loadId = data.loadId + def tableMeta = data.tableMeta + tableId = tableMeta["${testTable}"].id + def index = tableMeta["${testTable}"].indexes[0] + indexId = index.indexId + schemaHash = index.schemaHash + partitionId = tableMeta["${testTable}"].partitionInfo.partitions[0].partitionId + } + } + + String resultFileName = "V1.${loadLabel}.${tableId}.${partitionId}.${indexId}.${bucketId}.${schemaHash}.parquet" + logger.info("resultFileName: " + resultFileName) + + Files.copy(Paths.get(dataFile), + Paths.get(context.config.dataPath + "/load_p0/ingestion_load/${resultFileName}"), StandardCopyOption.REPLACE_EXISTING) + + String etlResultFilePath = uploadToHdfs "/load_p0/ingestion_load/${resultFileName}" + + String dppResult = '{\\"isSuccess\\":true,\\"failedReason\\":\\"\\",\\"scannedRows\\":10,\\"fileNumber\\":1,' + + '\\"fileSize\\":2441,\\"normalRows\\":10,\\"abnormalRows\\":0,\\"unselectRows\\":0,' + + '\\"partialAbnormalRows\\":\\"[]\\",\\"scannedBytes\\":0}' + + String updateStatusReqBody = + """{ + "loadId": ${loadId}, + "statusInfo": { + "status": "SUCCESS", + "msg": "", + "appId": "", + "dppResult": "${dppResult}", + "filePathToSize": "{\\"${etlResultFilePath}\\": 81758}", + "hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}" + } + }""" + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_update" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body updateStatusReqBody + check { code, resBody -> + { + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + } + } + } + + def max_try_milli_secs = 120000 + while (max_try_milli_secs) { + def result = sql "show load where label = '${loadLabel}'" + if (result[0][2] == "FINISHED") { + sql "sync" + qt_select "select * from ${testTable} order by 1" + break + } else { + sleep(5000) // wait 1 second every time + max_try_milli_secs -= 5000 + if (max_try_milli_secs <= 0) { + assertEquals(1, 2) + } + } + } + + } + + if (enableHdfs()) { + + def tableName = 'test_ingestion_load_with_inverted_index' + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} ( + c_int int(11) NULL, + c_char char(15) NULL, + c_varchar varchar(100) NULL, + c_bool boolean NULL, + c_tinyint tinyint(4) NULL, + c_smallint smallint(6) NULL, + c_bigint bigint(20) NULL, + c_largeint largeint(40) NULL, + c_float float NULL, + c_double 
double NULL, + c_decimal decimal(6, 3) NULL, + c_decimalv3 decimal(6, 3) NULL, + c_date date NULL, + c_datev2 date NULL, + c_datetime datetime NULL, + c_datetimev2 datetime NULL, + INDEX idx_c_varchar(c_varchar) USING INVERTED, + INDEX idx_c_bigint(c_bigint) USING INVERTED, + INDEX idx_c_datetimev2(c_datetimev2) USING INVERTED + ) + DUPLICATE KEY(c_int) + DISTRIBUTED BY HASH(c_int) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + def label = "test_ingestion_load_with_inverted_index" + + testIngestLoadJob.call(tableName, label, context.config.dataPath + '/load_p0/ingestion_load/data.parquet') + + } + +} diff --git a/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_partition.groovy b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_partition.groovy new file mode 100644 index 00000000000000..97ebb7a0761067 --- /dev/null +++ b/regression-test/suites/load_p0/ingestion_load/test_ingestion_load_with_partition.groovy @@ -0,0 +1,160 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
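The hand-escaped JSON strings in the `_update` request bodies above can also be produced with groovy.json. A minimal sketch with illustrative values only (loadId, paths, sizes, and credentials are placeholders): the nested dppResult, filePathToSize, and hadoopProperties fields are themselves JSON documents carried as strings inside statusInfo, which is why they are serialized separately before being embedded.

    import groovy.json.JsonOutput

    // Illustrative values only; IDs, paths, sizes and credentials are placeholders.
    def dppResult = JsonOutput.toJson([isSuccess: true, failedReason: '', scannedRows: 10,
                                       fileNumber: 1, fileSize: 2441, normalRows: 10,
                                       abnormalRows: 0, unselectRows: 0,
                                       partialAbnormalRows: '[]', scannedBytes: 0])
    def updateStatusBody = JsonOutput.toJson([
            loadId    : 10011,
            statusInfo: [
                    status          : 'SUCCESS',
                    msg             : '',
                    appId           : '',
                    dppResult       : dppResult,
                    filePathToSize  : JsonOutput.toJson(['hdfs://ns1/load_p0/ingestion_load/V1.demo.parquet': 81758]),
                    hadoopProperties: JsonOutput.toJson(['fs.defaultFS': 'hdfs://ns1', 'hadoop.username': 'hadoop'])
            ]
    ])
    println JsonOutput.prettyPrint(updateStatusBody)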
+ +import java.nio.file.Files +import java.nio.file.Paths +import java.nio.file.StandardCopyOption + +suite('test_ingestion_load_with_partition', 'p0') { + + def testIngestLoadJob = { testTable, loadLabel, dataFiles -> + + sql "TRUNCATE TABLE ${testTable}" + + sql "CLEAN LABEL FROM ${context.dbName}" + + Integer loadId = -1 + Integer tableId = -1 + Integer partitionId = -1 + Integer indexId = -1 + Integer bucketId = 0 + Integer schemaHash = -1 + + String reqBody = + """{ + "label": "${loadLabel}", + "tableToPartition": { + "${testTable}": [] + }, + "properties": {} + }""" + + resultFileNames = [] + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_create" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body reqBody + check { code, resBody -> + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + def data = resBodyJson.data + loadId = data.loadId + def tableMeta = data.tableMeta + tableId = tableMeta["${testTable}"].id + def index = tableMeta["${testTable}"].indexes[0] + indexId = index.indexId + schemaHash = index.schemaHash + partitions = tableMeta["${testTable}"].partitionInfo.partitions + for(partition in partitions) { + logger.info("partitionId: " + partition.partitionId) + resultFileNames.add("V1.${loadLabel}.${tableId}.${partition.partitionId}.${indexId}.${bucketId}.${schemaHash}.parquet") + } + } + } + + etlResultFilePaths = [] + for(int i=0; i < dataFiles.size(); i++) { + Files.copy(Paths.get(dataFiles[i]), + Paths.get(context.config.dataPath + "/load_p0/ingestion_load/${resultFileNames[i]}"), StandardCopyOption.REPLACE_EXISTING) + String etlResultFilePath = uploadToHdfs "/load_p0/ingestion_load/${resultFileNames[i]}" + logger.info("etlResultFilePath: " + etlResultFilePath) + etlResultFilePaths.add(etlResultFilePath) + } + + String dppResult = '{\\"isSuccess\\":true,\\"failedReason\\":\\"\\",\\"scannedRows\\":10,\\"fileNumber\\":1,' + + '\\"fileSize\\":2441,\\"normalRows\\":10,\\"abnormalRows\\":0,\\"unselectRows\\":0,' + + '\\"partialAbnormalRows\\":\\"[]\\",\\"scannedBytes\\":0}' + + String updateStatusReqBody = + """{ + "loadId": ${loadId}, + "statusInfo": { + "status": "SUCCESS", + "msg": "", + "appId": "", + "dppResult": "${dppResult}", + "filePathToSize": "{\\"${etlResultFilePaths.get(0)}\\":851,\\"${etlResultFilePaths.get(1)}\\":781,\\"${etlResultFilePaths.get(2)}\\":781,\\"${etlResultFilePaths.get(3)}\\":839}", + "hadoopProperties": "{\\"fs.defaultFS\\":\\"${getHdfsFs()}\\",\\"hadoop.username\\":\\"${getHdfsUser()}\\",\\"hadoop.password\\":\\"${getHdfsPasswd()}\\"}" + } + }""" + + httpTest { + endpoint context.config.feHttpAddress + uri "/api/ingestion_load/internal/${context.dbName}/_update" + op "post" + basicAuthorization context.config.feHttpUser, context.config.feHttpPassword + body updateStatusReqBody + check { code, resBody -> + { + assert code == 200 + def resBodyJson = parseJson(resBody) + assert resBodyJson instanceof Map + assert resBodyJson.code == 0 + } + } + } + + max_try_milli_secs = 120000 + while (max_try_milli_secs) { + result = sql "show load where label = '${loadLabel}'" + if (result[0][2] == "FINISHED") { + sql "sync" + qt_select "select c1, count(*) from ${testTable} group by c1 order by c1" + break + } else { + sleep(5000) // wait 1 second every time + max_try_milli_secs -= 5000 + if (max_try_milli_secs <= 0) { + assertEquals(1, 2) + } + } + } + + } + + if (enableHdfs()) { + + 
tableName = 'tbl_test_spark_load_partition' + + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} ( + c0 int not null, + c1 date, + c2 varchar(64) + ) + DUPLICATE KEY(c0) + PARTITION BY RANGE(c1) ( + FROM ("2024-09-01") TO ("2024-09-05") INTERVAL 1 DAY + ) + DISTRIBUTED BY HASH(c0) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + def label = "test_ingestion_load_partition" + + testIngestLoadJob.call(tableName, label, [context.config.dataPath + '/load_p0/ingestion_load/data2-0.parquet', context.config.dataPath + '/load_p0/ingestion_load/data2-1.parquet',context.config.dataPath + '/load_p0/ingestion_load/data2-2.parquet',context.config.dataPath + '/load_p0/ingestion_load/data2-3.parquet']) + + } + +} \ No newline at end of file
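For the partitioned suites, every partition returned by the `_create` call gets its own result file, and `filePathToSize` must list every uploaded path with its byte size. A plain-Groovy sketch of that pairing; the label, IDs, sizes, and the HDFS prefix are placeholders rather than values from a real run.

    import groovy.json.JsonOutput

    // Placeholders standing in for values returned by the _create call.
    def label = 'demo_partition_label'
    def tableId = 10001L
    def indexId = 10003L
    def bucketId = 0
    def schemaHash = 123456789
    def partitions = [[partitionId: 10010L], [partitionId: 10011L],
                      [partitionId: 10012L], [partitionId: 10013L]]
    def sizes = [851, 781, 781, 839]

    // One result file per partition, following the V1.<...>.parquet convention.
    def resultNames = partitions.collect {
        "V1.${label}.${tableId}.${it.partitionId}.${indexId}.${bucketId}.${schemaHash}.parquet"
    }
    // filePathToSize maps each uploaded HDFS path to its size in bytes.
    def filePathToSize = [resultNames, sizes].transpose().collectEntries { name, size ->
        ["hdfs://ns1/load_p0/ingestion_load/${name}".toString(), size]
    }
    println JsonOutput.toJson(filePathToSize)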