Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
The class TestOrcSplitElimination, method testSplitEliminationLargeMaxSplit.
@Test
public void testSplitEliminationLargeMaxSplit() throws Exception {
  ObjectInspector inspector = createIO();
  Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.NONE, 10000, 10000);
  writeData(writer);
  writer.close();
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1000);
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 150000);
  InputFormat<?, ?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  GenericUDF udf = new GenericUDFOPEqualOrLessThan();
  List<ExprNodeDesc> childExpr = Lists.newArrayList();
  ExprNodeConstantDesc con;
  ExprNodeGenericFuncDesc en;
  String sargStr;
  createTestSarg(inspector, udf, childExpr);
  InputSplit[] splits = in.getSplits(conf, 1);
  assertEquals(2, splits.length);
  con = new ExprNodeConstantDesc(0);
  childExpr.set(1, con);
  en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
  splits = in.getSplits(conf, 1);
  // no stripe satisfies the condition
  assertEquals(0, splits.length);
  con = new ExprNodeConstantDesc(2);
  childExpr.set(1, con);
  en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
  splits = in.getSplits(conf, 1);
  // only the first stripe satisfies the condition, hence a single split
  assertEquals(1, splits.length);
  con = new ExprNodeConstantDesc(5);
  childExpr.set(1, con);
  en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
  splits = in.getSplits(conf, 1);
  // the first stripe satisfies the predicate and forms one split; the last stripe
  // is a separate split
  assertEquals(2, splits.length);
  con = new ExprNodeConstantDesc(13);
  childExpr.set(1, con);
  en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
  splits = in.getSplits(conf, 1);
  // the first 2 stripes satisfy the predicate and are merged into a single split;
  // the last stripe is a separate split
  assertEquals(2, splits.length);
  con = new ExprNodeConstantDesc(29);
  childExpr.set(1, con);
  en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
  splits = in.getSplits(conf, 1);
  // the first 3 stripes satisfy the predicate and are merged into a single split;
  // the last stripe is a separate split
  assertEquals(2, splits.length);
  con = new ExprNodeConstantDesc(70);
  childExpr.set(1, con);
  en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
  splits = in.getSplits(conf, 1);
  // the first 2 stripes satisfy the predicate and are merged into a single split;
  // the last two stripes form a separate split
  assertEquals(2, splits.length);
}
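The test repeats the same five-line pattern with a different upper bound each time. A hedged sketch of a helper that could live in the same test class and capture that pattern (the method name splitsForUpperBound is hypothetical; it assumes conf is a JobConf, as in the surrounding Hive test code):

// Hypothetical helper for the repeated pattern above: rebuild the `col <= bound`
// predicate, serialize it into the scan configuration, and re-plan the splits.
private InputSplit[] splitsForUpperBound(InputFormat<?, ?> in, JobConf conf,
    ObjectInspector inspector, GenericUDF udf, List<ExprNodeDesc> childExpr,
    int upperBound) throws IOException {
  childExpr.set(1, new ExprNodeConstantDesc(upperBound));
  ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  conf.set("hive.io.filter.expr.serialized", SerializationUtilities.serializeExpression(en));
  return in.getSplits(conf, 1);
}

With such a helper, each case reduces to a single line, e.g. assertEquals(0, splitsForUpperBound(in, conf, inspector, udf, childExpr, 0).length).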
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
The class TestVectorUDFAdaptor, method testGenericUDF.
// test the UDF adaptor for a generic UDF (as opposed to a legacy UDF)
@Test
public void testGenericUDF() {
  // create a syntax tree for a function call 'myisnull(col0, "UNKNOWN")'
  ExprNodeGenericFuncDesc funcDesc;
  GenericUDF genericUDF = new GenericUDFIsNull();
  TypeInfo typeInfoStr = TypeInfoFactory.stringTypeInfo;
  List<ExprNodeDesc> children = new ArrayList<ExprNodeDesc>();
  children.add(new ExprNodeColumnDesc(typeInfoStr, "col0", "tablename", false));
  children.add(new ExprNodeConstantDesc(typeInfoStr, "UNKNOWN"));
  VectorUDFArgDesc[] argDescs = new VectorUDFArgDesc[2];
  for (int i = 0; i < 2; i++) {
    argDescs[i] = new VectorUDFArgDesc();
  }
  argDescs[0].setVariable(0);
  argDescs[1].setConstant((ExprNodeConstantDesc) children.get(1));
  funcDesc = new ExprNodeGenericFuncDesc(typeInfoStr, genericUDF, "myisnull", children);
  // create the adaptor for this function call to work in vector mode
  VectorUDFAdaptor vudf = null;
  try {
    vudf = new VectorUDFAdaptor(funcDesc, 3, "String", argDescs);
  } catch (HiveException e) {
    // We should never get here.
    assertTrue(false);
  }
  VectorizedRowBatch b;
  byte[] red = null;
  byte[] unknown = null;
  try {
    red = "red".getBytes("UTF-8");
    unknown = "UNKNOWN".getBytes("UTF-8");
  } catch (Exception e) {
    // UTF-8 is always supported, so this cannot happen
  }
  BytesColumnVector out;
  // with nulls
  b = getBatchStrDblLongWithStrOut();
  b.cols[0].noNulls = false;
  // set the 1st entry to null
  b.cols[0].isNull[0] = true;
  vudf.evaluate(b);
  out = (BytesColumnVector) b.cols[3];
  // verify the outputs
  int cmp = StringExpr.compare(red, 0, red.length, out.vector[1], out.start[1], out.length[1]);
  assertEquals(0, cmp);
  cmp = StringExpr.compare(unknown, 0, unknown.length, out.vector[0], out.start[0], out.length[0]);
  assertEquals(0, cmp);
  // the output entry should not be null for a null input for this particular generic UDF
  assertTrue(out.noNulls || !out.isNull[0]);
}
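The test exercises GenericUDFIsNull through the adaptor. For readers new to the interface, a minimal sketch of what a myisnull-style GenericUDF could look like; the class name GenericUDFMyIsNull is hypothetical, and the sketch assumes both arguments share the first argument's type and ObjectInspector:

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;

// Hypothetical sketch: return the first argument, or the second if the first is null.
public class GenericUDFMyIsNull extends GenericUDF {

  @Override
  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
    if (arguments.length != 2) {
      throw new UDFArgumentLengthException("myisnull expects exactly two arguments");
    }
    // Assumes both arguments have the same type; a production UDF would
    // validate the ObjectInspectors and insert conversions here.
    return arguments[0];
  }

  @Override
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    Object value = arguments[0].get();
    return value != null ? value : arguments[1].get();
  }

  @Override
  public String getDisplayString(String[] children) {
    return getStandardDisplayString("myisnull", children);
  }
}

initialize() runs once per query to resolve types; evaluate() runs per row, which is exactly the scalar path that VectorUDFAdaptor wraps so the UDF can be applied to a whole VectorizedRowBatch.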
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
The class IndexPredicateAnalyzer, method analyzeExpr.
private ExprNodeDesc analyzeExpr(ExprNodeGenericFuncDesc expr, List<IndexSearchCondition> searchConditions,
    Object... nodeOutputs) throws SemanticException {
  if (FunctionRegistry.isOpAnd(expr)) {
    assert (nodeOutputs.length >= 2);
    List<ExprNodeDesc> residuals = new ArrayList<ExprNodeDesc>();
    for (Object residual : nodeOutputs) {
      if (null != residual) {
        residuals.add((ExprNodeDesc) residual);
      }
    }
    if (residuals.size() == 0) {
      return null;
    } else if (residuals.size() == 1) {
      return residuals.get(0);
    } else if (residuals.size() > 1) {
      return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
          FunctionRegistry.getGenericUDFForAnd(), residuals);
    }
  }
  GenericUDF genericUDF = expr.getGenericUDF();
  if (!(genericUDF instanceof GenericUDFBaseCompare)) {
    return expr;
  }
  ExprNodeDesc expr1 = (ExprNodeDesc) nodeOutputs[0];
  ExprNodeDesc expr2 = (ExprNodeDesc) nodeOutputs[1];
  // We may need to peel off the GenericUDFBridge that is added by CBO or the user
  if (expr1.getTypeInfo().equals(expr2.getTypeInfo())) {
    expr1 = getColumnExpr(expr1);
    expr2 = getColumnExpr(expr2);
  }
  ExprNodeDesc[] extracted = ExprNodeDescUtils.extractComparePair(expr1, expr2);
  if (extracted == null || (extracted.length > 2 && !acceptsFields)) {
    return expr;
  }
  ExprNodeColumnDesc columnDesc;
  ExprNodeConstantDesc constantDesc;
  if (extracted[0] instanceof ExprNodeConstantDesc) {
    genericUDF = genericUDF.flip();
    columnDesc = (ExprNodeColumnDesc) extracted[1];
    constantDesc = (ExprNodeConstantDesc) extracted[0];
  } else {
    columnDesc = (ExprNodeColumnDesc) extracted[0];
    constantDesc = (ExprNodeConstantDesc) extracted[1];
  }
  Set<String> allowed = columnToUDFs.get(columnDesc.getColumn());
  if (allowed == null) {
    return expr;
  }
  String udfName = genericUDF.getUdfName();
  if (!allowed.contains(udfName)) {
    return expr;
  }
  String[] fields = null;
  if (extracted.length > 2) {
    ExprNodeFieldDesc fieldDesc = (ExprNodeFieldDesc) extracted[2];
    if (!isValidField(fieldDesc)) {
      return expr;
    }
    fields = ExprNodeDescUtils.extractFields(fieldDesc);
  }
  // We also need to update the expr so that the index query can be generated.
  // Note that Hive does not support UDFToDouble etc. in the query text.
  List<ExprNodeDesc> list = new ArrayList<ExprNodeDesc>();
  list.add(expr1);
  list.add(expr2);
  ExprNodeGenericFuncDesc indexExpr = new ExprNodeGenericFuncDesc(expr.getTypeInfo(), expr.getGenericUDF(), list);
  searchConditions.add(new IndexSearchCondition(columnDesc, udfName, constantDesc, indexExpr, expr, fields));
  // remove it from the residual predicate
  return fields == null ? null : expr;
}
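For context, a brief hedged sketch of how a storage handler typically drives this analysis; the pattern follows Hive's HBase handler, and the predicate variable and the column name "key" are illustrative:

IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();
analyzer.addComparisonOp(GenericUDFOPEqual.class.getName()); // allow only equality pushdown
analyzer.allowColumnName("key");                             // and only on the "key" column
List<IndexSearchCondition> searchConditions = new ArrayList<IndexSearchCondition>();
// analyzePredicate() walks the expression tree, invoking analyzeExpr() on each function node
ExprNodeDesc residualPredicate = analyzer.analyzePredicate(predicate, searchConditions);
// searchConditions now holds the pushable `column <op> constant` terms;
// residualPredicate is whatever Hive must still evaluate after the scan

This is why analyzeExpr() returns null for fully consumed conjuncts: anything it returns stays in the residual predicate and is re-evaluated by Hive.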
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
The class ExprNodeConverter, method visitCall.
@Override
public ExprNodeDesc visitCall(RexCall call) {
  ExprNodeDesc gfDesc = null;
  if (!deep) {
    return null;
  }
  List<ExprNodeDesc> args = new LinkedList<ExprNodeDesc>();
  if (call.getKind() == SqlKind.EXTRACT) {
    // Extract on date: special handling, since the Hive function encodes the
    // <time_unit> in its name. The <time_unit> information is therefore implicit,
    // and the translation proceeds correctly if we simply ignore the <time_unit>
    // operand and translate only the date operand.
    args.add(call.operands.get(1).accept(this));
  } else if (call.getKind() == SqlKind.FLOOR && call.operands.size() == 2) {
    // Floor on date: same special handling as Extract above; the <time_unit>
    // is implicit in the function name, so we ignore that operand.
    args.add(call.operands.get(0).accept(this));
  } else {
    for (RexNode operand : call.operands) {
      args.add(operand.accept(this));
    }
  }
  // If the call is a redundant cast then bail out, e.g. cast(true) to BOOLEAN
  if (call.isA(SqlKind.CAST) && (call.operands.size() == 1)
      && SqlTypeUtil.equalSansNullability(dTFactory, call.getType(), call.operands.get(0).getType())) {
    return args.get(0);
  } else {
    GenericUDF hiveUdf = SqlFunctionConverter.getHiveUDF(call.getOperator(), call.getType(), args.size());
    if (hiveUdf == null) {
      throw new RuntimeException("Cannot find UDF for " + call.getType() + " "
          + call.getOperator() + "[" + call.getOperator().getKind() + "]/" + args.size());
    }
    try {
      gfDesc = ExprNodeGenericFuncDesc.newInstance(hiveUdf, args);
    } catch (UDFArgumentException e) {
      LOG.error("Failed to instantiate udf: ", e);
      throw new RuntimeException(e);
    }
  }
  // Try to fold if it is a constant expression
  if (foldExpr && RexUtil.isConstant(call)) {
    ExprNodeDesc constantExpr = ConstantPropagateProcFactory.foldExpr((ExprNodeGenericFuncDesc) gfDesc);
    if (constantExpr != null) {
      gfDesc = constantExpr;
    }
  }
  return gfDesc;
}
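The ExprNodeGenericFuncDesc.newInstance() factory used above also type-checks the argument list. A minimal sketch of building an expression tree by hand with it; the identifiers here are illustrative:

// Build `concat(col0, 'x')` directly; newInstance() infers the return type from
// the UDF and its arguments, and throws UDFArgumentException on a mismatch.
GenericUDF udf = new GenericUDFConcat();
List<ExprNodeDesc> args = new ArrayList<ExprNodeDesc>();
args.add(new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col0", "t", false));
args.add(new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, "x"));
ExprNodeDesc concatDesc = ExprNodeGenericFuncDesc.newInstance(udf, args);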
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
The class TestVectorizationContext, method testInBloomFilter.
@Test
public void testInBloomFilter() throws Exception {
  // Setup InBloomFilter() UDF
  ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(TypeInfoFactory.getDecimalTypeInfo(10, 5), "a", "table", false);
  ExprNodeDesc bfExpr = new ExprNodeDynamicValueDesc(new DynamicValue("id1", TypeInfoFactory.binaryTypeInfo));
  ExprNodeGenericFuncDesc inBloomFilterExpr = new ExprNodeGenericFuncDesc();
  GenericUDF inBloomFilterUdf = new GenericUDFInBloomFilter();
  inBloomFilterExpr.setTypeInfo(TypeInfoFactory.booleanTypeInfo);
  inBloomFilterExpr.setGenericUDF(inBloomFilterUdf);
  List<ExprNodeDesc> children1 = new ArrayList<ExprNodeDesc>(2);
  children1.add(colExpr);
  children1.add(bfExpr);
  inBloomFilterExpr.setChildren(children1);
  // Setup VectorizationContext
  List<String> columns = new ArrayList<String>();
  columns.add("b");
  columns.add("a");
  VectorizationContext vc = new VectorizationContext("name", columns);
  // Create vectorized expr
  VectorExpression ve = vc.getVectorExpression(inBloomFilterExpr, VectorExpressionDescriptor.Mode.FILTER);
  Assert.assertEquals(VectorInBloomFilterColDynamicValue.class, ve.getClass());
  VectorInBloomFilterColDynamicValue vectorizedInBloomFilterExpr = (VectorInBloomFilterColDynamicValue) ve;
  VectorExpression[] children = vectorizedInBloomFilterExpr.getChildExpressions();
  // VectorInBloomFilterColDynamicValue should have all of the necessary information to vectorize.
  // There should be no need for child vector expressions, which would imply casting/conversion.
  Assert.assertNull(children);
}