Skip to content

Commit 3d7d82a

Browse files
authored
HIVE-25043: Addendum: Support custom UDF in Vectorized mode (#5703)
1 parent b16af6c commit 3d7d82a

File tree

8 files changed

+362
-20
lines changed

8 files changed

+362
-20
lines changed

common/src/java/org/apache/hadoop/hive/conf/HiveConf.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4702,9 +4702,9 @@ public static enum ConfVars {
47024702
"The default value is true."),
47034703

47044704
HIVE_VECTOR_ADAPTOR_CUSTOM_UDF_WHITELIST("hive.vectorized.adaptor.custom.udf.whitelist", "",
4705-
"Custom UDF allowed when hive.vectorized.adaptor.usage.mode is chosen.\n" +
4706-
"Specify classes separated by commas:\n" +
4707-
"package.FooClass,package.BarClass"),
4705+
"A comma-separated list of custom UDFs allowed to operate in vectorized mode " +
4706+
"when hive.vectorized.adaptor.usage.mode is set to chosen.\n" +
4707+
"Only Generic UDFs are supported for whitelisting; ensure that each custom UDF class extends GenericUDF"),
47084708

47094709
HIVE_VECTORIZATION_PTF_MAX_MEMORY_BUFFERING_BATCH_COUNT("hive.vectorized.ptf.max.memory.buffering.batch.count", 25,
47104710
"Maximum number of vectorized row batches to buffer in memory for PTF\n" +

data/conf/llap-udfs.lst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
org.apache.hadoop.hive.ql.udf.generic.GenericUDFCustomDateSub

itests/src/test/resources/testconfiguration.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ minillap.query.files=\
7171
cte_2.q,\
7272
cte_4.q,\
7373
cttl.q,\
74+
custom_udf_vectorization.q,\
7475
dynamic_partition_pruning_2.q,\
7576
dynamic_semijoin_user_level.q,\
7677
dynpart_cast.q,\
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hive.ql.udf.generic;
19+
20+
import org.apache.hadoop.hive.ql.exec.Description;
21+
import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
22+
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFDateSubColCol;
23+
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFDateSubColScalar;
24+
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorUDFDateSubScalarCol;
25+
26+
/**
27+
* UDFDateSub.
28+
*
29+
* Subtract a number of days to the date. The time part of the string will be
30+
* ignored.
31+
*
32+
* NOTE: This is a subset of what MySQL offers as:
33+
* http://dev.mysql.com/doc/refman
34+
* /5.1/en/date-and-time-functions.html#function_date-sub
35+
*
36+
*/
37+
@Description(name = "date_sub",
38+
value = "_FUNC_(start_date, num_days) - Returns the date that is num_days before start_date.",
39+
extended = "start_date is a string in the format 'yyyy-MM-dd HH:mm:ss' or"
40+
+ " 'yyyy-MM-dd'. num_days is a number. The time part of start_date is "
41+
+ "ignored.\n"
42+
+ "Example:\n "
43+
+ " > SELECT _FUNC_('2009-07-30', 1) FROM src LIMIT 1;\n"
44+
+ " '2009-07-29'")
45+
@VectorizedExpressions({VectorUDFDateSubColScalar.class, VectorUDFDateSubScalarCol.class, VectorUDFDateSubColCol.class})
46+
public class GenericUDFCustomDateSub extends GenericUDFDateAdd {
47+
public GenericUDFCustomDateSub() {
48+
this.signModifier = -1;
49+
}
50+
51+
@Override
52+
public String getDisplayString(String[] children) {
53+
return getStandardDisplayString("date_sub", children);
54+
}
55+
}

ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -277,18 +277,6 @@ public static HiveVectorIfStmtMode getHiveConfValue(HiveConf hiveConf) {
277277

278278
private HiveVectorIfStmtMode hiveVectorIfStmtMode;
279279

280-
private Set<String> allowedCustomUDFs;
281-
282-
private Set<String> getAllowedCustomUDFs(HiveConf hiveConf) {
283-
String udfs = HiveConf.getVar(hiveConf,
284-
HiveConf.ConfVars.HIVE_VECTOR_ADAPTOR_CUSTOM_UDF_WHITELIST);
285-
if (udfs != null && !udfs.isEmpty()) {
286-
return new HashSet<>(Arrays.asList(udfs.split(",")));
287-
}
288-
289-
return new HashSet<>();
290-
}
291-
292280
//when set to true use the overflow checked vector expressions
293281
private boolean useCheckedVectorExpressions;
294282

@@ -310,7 +298,6 @@ private void setHiveConfVars(HiveConf hiveConf) {
310298
adaptorSuppressEvaluateExceptions =
311299
HiveConf.getBoolVar(
312300
hiveConf, HiveConf.ConfVars.HIVE_VECTORIZED_ADAPTOR_SUPPRESS_EVALUATE_EXCEPTIONS);
313-
this.allowedCustomUDFs = getAllowedCustomUDFs(hiveConf);
314301
}
315302

316303
private void copyHiveConfVars(VectorizationContext vContextEnvironment) {
@@ -1050,7 +1037,7 @@ public VectorExpression getVectorExpression(ExprNodeDesc exprDesc, VectorExpress
10501037
"Could not vectorize expression (mode = " + mode.name() + "): " + exprDesc.toString()
10511038
+ " because hive.vectorized.adaptor.usage.mode=none");
10521039
case CHOSEN:
1053-
if (isNonVectorizedPathUDF(expr, mode, allowedCustomUDFs)) {
1040+
if (isNonVectorizedPathUDF(expr, mode)) {
10541041
ve = getCustomUDFExpression(expr, mode);
10551042
} else {
10561043
throw new HiveException(
@@ -1460,7 +1447,7 @@ public static GenericUDF getGenericUDFForCast(TypeInfo castType) throws HiveExce
14601447
* may be implemented in the future with an optimized VectorExpression.
14611448
*/
14621449
private static boolean isNonVectorizedPathUDF(ExprNodeGenericFuncDesc expr,
1463-
VectorExpressionDescriptor.Mode mode, Set<String> allowCustomUDFs) {
1450+
VectorExpressionDescriptor.Mode mode) {
14641451
GenericUDF gudf = expr.getGenericUDF();
14651452
if (gudf instanceof GenericUDFBridge) {
14661453
GenericUDFBridge bridge = (GenericUDFBridge) gudf;
@@ -1498,8 +1485,6 @@ private static boolean isNonVectorizedPathUDF(ExprNodeGenericFuncDesc expr,
14981485
return true;
14991486
} else if (gudf instanceof GenericUDFConcat && (mode == VectorExpressionDescriptor.Mode.PROJECTION)) {
15001487
return true;
1501-
} else if (allowCustomUDFs.contains(gudf.getClass().getName())) {
1502-
return true;
15031488
}
15041489
return false;
15051490
}

ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@
178178
import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef;
179179
import org.apache.hadoop.hive.ql.plan.ptf.WindowFunctionDef;
180180
import org.apache.hadoop.hive.ql.plan.ptf.WindowTableFunctionDef;
181+
import org.apache.hadoop.hive.ql.session.SessionState;
181182
import org.apache.hadoop.hive.ql.udf.UDFAcos;
182183
import org.apache.hadoop.hive.ql.udf.UDFAsin;
183184
import org.apache.hadoop.hive.ql.udf.UDFAtan;
@@ -550,6 +551,9 @@ public Vectorizer() {
550551

551552
// For conditional expressions
552553
supportedGenericUDFs.add(GenericUDFIf.class);
554+
555+
// Add user custom UDFs
556+
addCustomUDFs(SessionState.getSessionConf());
553557
}
554558

555559
private class VectorTaskColumnInfo {
@@ -2411,6 +2415,40 @@ private boolean getOnlyStructObjectInspectors(ReduceWork reduceWork,
24112415
}
24122416
}
24132417

2418+
private void addCustomUDFs(HiveConf hiveConf) {
2419+
if (hiveConf == null) {
2420+
return;
2421+
}
2422+
2423+
if (HiveVectorAdaptorUsageMode.CHOSEN !=
2424+
HiveVectorAdaptorUsageMode.getHiveConfValue(hiveConf)) {
2425+
return;
2426+
}
2427+
2428+
String[] udfs =
2429+
HiveConf.getTrimmedStringsVar(hiveConf,
2430+
HiveConf.ConfVars.HIVE_VECTOR_ADAPTOR_CUSTOM_UDF_WHITELIST);
2431+
if (udfs == null) {
2432+
return;
2433+
}
2434+
2435+
ClassLoader loader = Utilities.getSessionSpecifiedClassLoader();
2436+
2437+
for (String udf : udfs) {
2438+
try {
2439+
Class<?> cls = Class.forName(udf, true, loader);
2440+
if (GenericUDF.class.isAssignableFrom(cls)) {
2441+
supportedGenericUDFs.add(cls);
2442+
LOG.info("Registered custom UDF: {}", udf);
2443+
} else {
2444+
LOG.warn("{} must inherit from the GenericUDF", udf);
2445+
}
2446+
} catch (ClassNotFoundException e) {
2447+
LOG.warn("Failed to register custom UDF: {}", udf, e);
2448+
}
2449+
}
2450+
}
2451+
24142452
@Override
24152453
public PhysicalContext resolve(PhysicalContext physicalContext) throws SemanticException {
24162454

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
--! qt:dataset:src
2+
set hive.mapred.mode=nonstrict;
3+
set hive.explain.user=false;
4+
set hive.vectorized.execution.enabled=true;
5+
set hive.vectorized.adaptor.usage.mode=chosen;
6+
set hive.llap.execution.mode=auto;
7+
8+
CREATE TEMPORARY FUNCTION CDATE_SUB AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFCustomDateSub';
9+
10+
set hive.vectorized.adaptor.custom.udf.whitelist=org.apache.hadoop.hive.ql.udf.generic.GenericUDFCustomDateSub;
11+
12+
EXPLAIN VECTORIZATION
13+
SELECT CDATE_SUB('2000-01-01', 1) FROM src
14+
UNION ALL
15+
SELECT CDATE_SUB('2000-01-01', CAST(key as int)) FROM src
16+
UNION ALL
17+
SELECT CDATE_SUB('2000-01-01', 1) FROM src WHERE key IS NOT NULL
18+
UNION ALL
19+
SELECT CDATE_SUB('2000-01-01', CAST(key as int)) FROM src WHERE key Is NOT NULL
20+
UNION ALL
21+
SELECT CDATE_SUB('2000-01-01', 1) FROM src WHERE key <> '' AND value <> ''
22+
UNION ALL
23+
SELECT CDATE_SUB('2000-01-01', CAST(key as int)) FROM src WHERE key <> '' AND value <> '';
24+

0 commit comments

Comments
 (0)