Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
The class TestOrcSplitElimination, method testSplitEliminationLargeMaxSplit.
@Test
public void testSplitEliminationLargeMaxSplit() throws Exception {
  ObjectInspector inspector = createIO();
  Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.NONE, 10000, 10000);
  writeData(writer);
  writer.close();
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1000);
  HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 150000);
  InputFormat<?, ?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  GenericUDF udf = new GenericUDFOPEqualOrLessThan();
  List<ExprNodeDesc> childExpr = Lists.newArrayList();
  ExprNodeConstantDesc con;
  ExprNodeGenericFuncDesc en;
  String sargStr;
  createTestSarg(inspector, udf, childExpr);
  InputSplit[] splits = in.getSplits(conf, 1);
  assertEquals(2, splits.length);
  con = new ExprNodeConstantDesc(0);
  childExpr.set(1, con);
  en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
  splits = in.getSplits(conf, 1);
  // no stripe satisfies the condition
  assertEquals(0, splits.length);
  con = new ExprNodeConstantDesc(2);
  childExpr.set(1, con);
  en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
  splits = in.getSplits(conf, 1);
  // only the first stripe satisfies the condition, hence a single split
  assertEquals(1, splits.length);
  con = new ExprNodeConstantDesc(5);
  childExpr.set(1, con);
  en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
  splits = in.getSplits(conf, 1);
  // the first stripe satisfies the predicate and forms one split; the last stripe
  // is a separate split
  assertEquals(2, splits.length);
  con = new ExprNodeConstantDesc(13);
  childExpr.set(1, con);
  en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
  splits = in.getSplits(conf, 1);
  // the first 2 stripes satisfy the predicate and are merged into a single split;
  // the last stripe is a separate split
  assertEquals(2, splits.length);
  con = new ExprNodeConstantDesc(29);
  childExpr.set(1, con);
  en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
  splits = in.getSplits(conf, 1);
  // the first 3 stripes satisfy the predicate and are merged into a single split;
  // the last stripe is a separate split
  assertEquals(2, splits.length);
  con = new ExprNodeConstantDesc(70);
  childExpr.set(1, con);
  en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
  splits = in.getSplits(conf, 1);
  // the first 2 stripes satisfy the predicate and are merged into a single split;
  // the last two stripes form a separate split
  assertEquals(2, splits.length);
}
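The test repeats the same five-line pattern with a different upper bound each time. A hedged sketch of a helper that could live in the same test class and capture that pattern (the method name splitsForUpperBound is hypothetical; it assumes conf is a JobConf, as in the surrounding Hive test code):

// Hypothetical helper for the repeated pattern above: rebuild the `col <= bound`
// predicate, serialize it into the scan configuration, and re-plan the splits.
private InputSplit[] splitsForUpperBound(InputFormat<?, ?> in, JobConf conf,
    ObjectInspector inspector, GenericUDF udf, List<ExprNodeDesc> childExpr,
    int upperBound) throws IOException {
  childExpr.set(1, new ExprNodeConstantDesc(upperBound));
  ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  conf.set("hive.io.filter.expr.serialized", SerializationUtilities.serializeExpression(en));
  return in.getSplits(conf, 1);
}

With such a helper, each case reduces to a single line, e.g. assertEquals(0, splitsForUpperBound(in, conf, inspector, udf, childExpr, 0).length).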
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
The class TestVectorUDFAdaptor, method testGenericUDF.
// test the UDF adaptor for a generic UDF (as opposed to a legacy UDF)
@Test
public void testGenericUDF() {
  // create a syntax tree for a function call 'myisnull(col0, "UNKNOWN")'
  ExprNodeGenericFuncDesc funcDesc;
  GenericUDF genericUDF = new GenericUDFIsNull();
  TypeInfo typeInfoStr = TypeInfoFactory.stringTypeInfo;
  List<ExprNodeDesc> children = new ArrayList<ExprNodeDesc>();
  children.add(new ExprNodeColumnDesc(typeInfoStr, "col0", "tablename", false));
  children.add(new ExprNodeConstantDesc(typeInfoStr, "UNKNOWN"));
  VectorUDFArgDesc[] argDescs = new VectorUDFArgDesc[2];
  for (int i = 0; i < 2; i++) {
    argDescs[i] = new VectorUDFArgDesc();
  }
  argDescs[0].setVariable(0);
  argDescs[1].setConstant((ExprNodeConstantDesc) children.get(1));
  funcDesc = new ExprNodeGenericFuncDesc(typeInfoStr, genericUDF, "myisnull", children);
  // create the adaptor for this function call to work in vector mode
  VectorUDFAdaptor vudf = null;
  try {
    vudf = new VectorUDFAdaptor(funcDesc, 3, "String", argDescs);
  } catch (HiveException e) {
    // We should never get here.
    assertTrue(false);
  }
  VectorizedRowBatch b;
  byte[] red = null;
  byte[] unknown = null;
  try {
    red = "red".getBytes("UTF-8");
    unknown = "UNKNOWN".getBytes("UTF-8");
  } catch (Exception e) {
    // UTF-8 is always supported, so this cannot happen
  }
  BytesColumnVector out;
  // with nulls
  b = getBatchStrDblLongWithStrOut();
  b.cols[0].noNulls = false;
  // set the 1st entry to null
  b.cols[0].isNull[0] = true;
  vudf.evaluate(b);
  out = (BytesColumnVector) b.cols[3];
  // verify the outputs
  int cmp = StringExpr.compare(red, 0, red.length, out.vector[1], out.start[1], out.length[1]);
  assertEquals(0, cmp);
  cmp = StringExpr.compare(unknown, 0, unknown.length, out.vector[0], out.start[0], out.length[0]);
  assertEquals(0, cmp);
  // the output entry should not be null for a null input for this particular generic UDF
  assertTrue(out.noNulls || !out.isNull[0]);
}
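The test exercises GenericUDFIsNull through the adaptor. For readers new to the interface, a minimal sketch of what a myisnull-style GenericUDF could look like; the class name GenericUDFMyIsNull is hypothetical, and the sketch assumes both arguments share the first argument's type and ObjectInspector:

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;

// Hypothetical sketch: return the first argument, or the second if the first is null.
public class GenericUDFMyIsNull extends GenericUDF {

  @Override
  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
    if (arguments.length != 2) {
      throw new UDFArgumentLengthException("myisnull expects exactly two arguments");
    }
    // Assumes both arguments have the same type; a production UDF would
    // validate the ObjectInspectors and insert conversions here.
    return arguments[0];
  }

  @Override
  public Object evaluate(DeferredObject[] arguments) throws HiveException {
    Object value = arguments[0].get();
    return value != null ? value : arguments[1].get();
  }

  @Override
  public String getDisplayString(String[] children) {
    return getStandardDisplayString("myisnull", children);
  }
}

initialize() runs once per query to resolve types; evaluate() runs per row, which is exactly the scalar path that VectorUDFAdaptor wraps so the UDF can be applied to a whole VectorizedRowBatch.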
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
The class IndexPredicateAnalyzer, method analyzeExpr.
private ExprNodeDesc analyzeExpr(ExprNodeGenericFuncDesc expr, List<IndexSearchCondition> searchConditions,
    Object... nodeOutputs) throws SemanticException {
  if (FunctionRegistry.isOpAnd(expr)) {
    assert (nodeOutputs.length >= 2);
    List<ExprNodeDesc> residuals = new ArrayList<ExprNodeDesc>();
    for (Object residual : nodeOutputs) {
      if (null != residual) {
        residuals.add((ExprNodeDesc) residual);
      }
    }
    if (residuals.size() == 0) {
      return null;
    } else if (residuals.size() == 1) {
      return residuals.get(0);
    } else if (residuals.size() > 1) {
      return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
          FunctionRegistry.getGenericUDFForAnd(), residuals);
    }
  }
  GenericUDF genericUDF = expr.getGenericUDF();
  if (!(genericUDF instanceof GenericUDFBaseCompare)) {
    return expr;
  }
  ExprNodeDesc expr1 = (ExprNodeDesc) nodeOutputs[0];
  ExprNodeDesc expr2 = (ExprNodeDesc) nodeOutputs[1];
  // We may need to peel off the GenericUDFBridge that is added by CBO or the user
  if (expr1.getTypeInfo().equals(expr2.getTypeInfo())) {
    expr1 = getColumnExpr(expr1);
    expr2 = getColumnExpr(expr2);
  }
  ExprNodeDesc[] extracted = ExprNodeDescUtils.extractComparePair(expr1, expr2);
  if (extracted == null || (extracted.length > 2 && !acceptsFields)) {
    return expr;
  }
  ExprNodeColumnDesc columnDesc;
  ExprNodeConstantDesc constantDesc;
  if (extracted[0] instanceof ExprNodeConstantDesc) {
    genericUDF = genericUDF.flip();
    columnDesc = (ExprNodeColumnDesc) extracted[1];
    constantDesc = (ExprNodeConstantDesc) extracted[0];
  } else {
    columnDesc = (ExprNodeColumnDesc) extracted[0];
    constantDesc = (ExprNodeConstantDesc) extracted[1];
  }
  Set<String> allowed = columnToUDFs.get(columnDesc.getColumn());
  if (allowed == null) {
    return expr;
  }
  String udfName = genericUDF.getUdfName();
  if (!allowed.contains(udfName)) {
    return expr;
  }
  String[] fields = null;
  if (extracted.length > 2) {
    ExprNodeFieldDesc fieldDesc = (ExprNodeFieldDesc) extracted[2];
    if (!isValidField(fieldDesc)) {
      return expr;
    }
    fields = ExprNodeDescUtils.extractFields(fieldDesc);
  }
  // We also need to update the expr so that the index query can be generated.
  // Note that Hive does not support UDFToDouble etc. in the query text.
  List<ExprNodeDesc> list = new ArrayList<ExprNodeDesc>();
  list.add(expr1);
  list.add(expr2);
  ExprNodeGenericFuncDesc indexExpr = new ExprNodeGenericFuncDesc(expr.getTypeInfo(), expr.getGenericUDF(), list);
  searchConditions.add(new IndexSearchCondition(columnDesc, udfName, constantDesc, indexExpr, expr, fields));
  // remove it from the residual predicate
  return fields == null ? null : expr;
}
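For context, a brief hedged sketch of how a storage handler typically drives this analysis; the pattern follows Hive's HBase handler, and the predicate variable and the column name "key" are illustrative:

IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();
analyzer.addComparisonOp(GenericUDFOPEqual.class.getName()); // allow only equality pushdown
analyzer.allowColumnName("key");                             // and only on the "key" column
List<IndexSearchCondition> searchConditions = new ArrayList<IndexSearchCondition>();
// analyzePredicate() walks the expression tree, invoking analyzeExpr() on each function node
ExprNodeDesc residualPredicate = analyzer.analyzePredicate(predicate, searchConditions);
// searchConditions now holds the pushable `column <op> constant` terms;
// residualPredicate is whatever Hive must still evaluate after the scan

This is why analyzeExpr() returns null for fully consumed conjuncts: anything it returns stays in the residual predicate and is re-evaluated by Hive.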
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
The class ExprNodeConverter, method visitCall.
@Override
public ExprNodeDesc visitCall(RexCall call) {
  ExprNodeDesc gfDesc = null;
  if (!deep) {
    return null;
  }
  List<ExprNodeDesc> args = new LinkedList<ExprNodeDesc>();
  if (call.getKind() == SqlKind.EXTRACT) {
    // Extract on date: special handling, since the Hive function encodes the
    // <time_unit> in its name. The <time_unit> information is therefore implicit,
    // and the translation proceeds correctly if we simply ignore the <time_unit>
    // operand and translate only the date operand.
    args.add(call.operands.get(1).accept(this));
  } else if (call.getKind() == SqlKind.FLOOR && call.operands.size() == 2) {
    // Floor on date: same special handling as Extract above; the <time_unit>
    // is implicit in the function name, so we ignore that operand.
    args.add(call.operands.get(0).accept(this));
  } else {
    for (RexNode operand : call.operands) {
      args.add(operand.accept(this));
    }
  }
  // If the call is a redundant cast then bail out, e.g. cast(true) to BOOLEAN
  if (call.isA(SqlKind.CAST) && (call.operands.size() == 1)
      && SqlTypeUtil.equalSansNullability(dTFactory, call.getType(), call.operands.get(0).getType())) {
    return args.get(0);
  } else {
    GenericUDF hiveUdf = SqlFunctionConverter.getHiveUDF(call.getOperator(), call.getType(), args.size());
    if (hiveUdf == null) {
      throw new RuntimeException("Cannot find UDF for " + call.getType() + " "
          + call.getOperator() + "[" + call.getOperator().getKind() + "]/" + args.size());
    }
    try {
      gfDesc = ExprNodeGenericFuncDesc.newInstance(hiveUdf, args);
    } catch (UDFArgumentException e) {
      LOG.error("Failed to instantiate udf: ", e);
      throw new RuntimeException(e);
    }
  }
  // Try to fold if it is a constant expression
  if (foldExpr && RexUtil.isConstant(call)) {
    ExprNodeDesc constantExpr = ConstantPropagateProcFactory.foldExpr((ExprNodeGenericFuncDesc) gfDesc);
    if (constantExpr != null) {
      gfDesc = constantExpr;
    }
  }
  return gfDesc;
}
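The ExprNodeGenericFuncDesc.newInstance() factory used above also type-checks the argument list. A minimal sketch of building an expression tree by hand with it; the identifiers here are illustrative:

// Build `concat(col0, 'x')` directly; newInstance() infers the return type from
// the UDF and its arguments, and throws UDFArgumentException on a mismatch.
GenericUDF udf = new GenericUDFConcat();
List<ExprNodeDesc> args = new ArrayList<ExprNodeDesc>();
args.add(new ExprNodeColumnDesc(TypeInfoFactory.stringTypeInfo, "col0", "t", false));
args.add(new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, "x"));
ExprNodeDesc concatDesc = ExprNodeGenericFuncDesc.newInstance(udf, args);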
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
The class TestVectorizationContext, method testInBloomFilter.
@Test
public void testInBloomFilter() throws Exception {
  // Setup InBloomFilter() UDF
  ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(TypeInfoFactory.getDecimalTypeInfo(10, 5), "a", "table", false);
  ExprNodeDesc bfExpr = new ExprNodeDynamicValueDesc(new DynamicValue("id1", TypeInfoFactory.binaryTypeInfo));
  ExprNodeGenericFuncDesc inBloomFilterExpr = new ExprNodeGenericFuncDesc();
  GenericUDF inBloomFilterUdf = new GenericUDFInBloomFilter();
  inBloomFilterExpr.setTypeInfo(TypeInfoFactory.booleanTypeInfo);
  inBloomFilterExpr.setGenericUDF(inBloomFilterUdf);
  List<ExprNodeDesc> children1 = new ArrayList<ExprNodeDesc>(2);
  children1.add(colExpr);
  children1.add(bfExpr);
  inBloomFilterExpr.setChildren(children1);
  // Setup VectorizationContext
  List<String> columns = new ArrayList<String>();
  columns.add("b");
  columns.add("a");
  VectorizationContext vc = new VectorizationContext("name", columns);
  // Create vectorized expr
  VectorExpression ve = vc.getVectorExpression(inBloomFilterExpr, VectorExpressionDescriptor.Mode.FILTER);
  Assert.assertEquals(VectorInBloomFilterColDynamicValue.class, ve.getClass());
  VectorInBloomFilterColDynamicValue vectorizedInBloomFilterExpr = (VectorInBloomFilterColDynamicValue) ve;
  VectorExpression[] children = vectorizedInBloomFilterExpr.getChildExpressions();
  // VectorInBloomFilterColDynamicValue should have all of the necessary information to vectorize.
  // There should be no need for child vector expressions, which would imply casting/conversion.
  Assert.assertNull(children);
}