Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
Class RexNodeConverter, method convert:
private RexNode convert(ExprNodeGenericFuncDesc func) throws SemanticException {
  ExprNodeDesc tmpExprNode;
  RexNode tmpRN;
  List<RexNode> childRexNodeLst = new ArrayList<RexNode>();
  Builder<RelDataType> argTypeBldr = ImmutableList.<RelDataType>builder();
  // TODO: 1) Expand to other functions as needed 2) What about types other than primitive.
  TypeInfo tgtDT = null;
  GenericUDF tgtUdf = func.getGenericUDF();
  boolean isNumeric = (tgtUdf instanceof GenericUDFBaseBinary
      && func.getTypeInfo().getCategory() == Category.PRIMITIVE
      && (PrimitiveGrouping.NUMERIC_GROUP == PrimitiveObjectInspectorUtils.getPrimitiveGrouping(
          ((PrimitiveTypeInfo) func.getTypeInfo()).getPrimitiveCategory())));
  boolean isCompare = !isNumeric && tgtUdf instanceof GenericUDFBaseCompare;
  boolean isWhenCase = tgtUdf instanceof GenericUDFWhen || tgtUdf instanceof GenericUDFCase;
  boolean isTransformableTimeStamp = func.getGenericUDF() instanceof GenericUDFUnixTimeStamp
      && func.getChildren().size() != 0;
  if (isNumeric) {
    tgtDT = func.getTypeInfo();
    assert func.getChildren().size() == 2;
    // TODO: checking 2 children is useless, compare already does that.
  } else if (isCompare && (func.getChildren().size() == 2)) {
    tgtDT = FunctionRegistry.getCommonClassForComparison(
        func.getChildren().get(0).getTypeInfo(), func.getChildren().get(1).getTypeInfo());
  } else if (isWhenCase) {
    // Stateful functions are not allowed inside CASE/WHEN expressions
    if (checkForStatefulFunctions(func.getChildren())) {
      throw new SemanticException("Stateful expressions cannot be used inside of CASE");
    }
  } else if (isTransformableTimeStamp) {
    // unix_timestamp(args) -> to_unix_timestamp(args)
    func = ExprNodeGenericFuncDesc.newInstance(new GenericUDFToUnixTimeStamp(), func.getChildren());
  }
  for (ExprNodeDesc childExpr : func.getChildren()) {
    tmpExprNode = childExpr;
    if (tgtDT != null
        && TypeInfoUtils.isConversionRequiredForComparison(tgtDT, childExpr.getTypeInfo())) {
      if (isCompare) {
        // For compare, we will convert requisite children
        tmpExprNode = ParseUtils.createConversionCast(childExpr, (PrimitiveTypeInfo) tgtDT);
      } else if (isNumeric) {
        // For numeric, we'll do minimum necessary cast - if we cast to the type
        // of expression, bad things will happen.
        PrimitiveTypeInfo minArgType = ExprNodeDescUtils.deriveMinArgumentCast(childExpr, tgtDT);
        tmpExprNode = ParseUtils.createConversionCast(childExpr, minArgType);
      } else {
        throw new AssertionError("Unexpected " + tgtDT + " - not a numeric op or compare");
      }
    }
    argTypeBldr.add(TypeConverter.convert(tmpExprNode.getTypeInfo(), cluster.getTypeFactory()));
    tmpRN = convert(tmpExprNode);
    childRexNodeLst.add(tmpRN);
  }
  // See if this is an explicit cast.
  RexNode expr = null;
  RelDataType retType = null;
  expr = handleExplicitCast(func, childRexNodeLst);
  if (expr == null) {
    // This is not a cast; process the function.
    retType = TypeConverter.convert(func.getTypeInfo(), cluster.getTypeFactory());
    SqlOperator calciteOp = SqlFunctionConverter.getCalciteOperator(func.getFuncText(),
        func.getGenericUDF(), argTypeBldr.build(), retType);
    if (calciteOp.getKind() == SqlKind.CASE) {
      // If it is a case operator, we need to rewrite it
      childRexNodeLst = rewriteCaseChildren(func, childRexNodeLst);
    } else if (HiveExtractDate.ALL_FUNCTIONS.contains(calciteOp)) {
      // If it is an extract operator, we need to rewrite it
      childRexNodeLst = rewriteExtractDateChildren(calciteOp, childRexNodeLst);
    } else if (HiveFloorDate.ALL_FUNCTIONS.contains(calciteOp)) {
      // If it is a floor <date> operator, we need to rewrite it
      childRexNodeLst = rewriteFloorDateChildren(calciteOp, childRexNodeLst);
    }
    expr = cluster.getRexBuilder().makeCall(calciteOp, childRexNodeLst);
  } else {
    retType = expr.getType();
  }
  // Flatten nested calls of the same operator. Cast calls are excluded, since type
  // inference on a cast can throw an exception.
  if (flattenExpr && (expr instanceof RexCall)
      && !(((RexCall) expr).getOperator() instanceof SqlCastFunction)) {
    RexCall call = (RexCall) expr;
    expr = cluster.getRexBuilder().makeCall(retType, call.getOperator(),
        RexUtil.flatten(call.getOperands(), call.getOperator()));
  }
  return expr;
}
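A note on the comparison branch above: when the two children of a GenericUDFBaseCompare differ in type, the converter asks FunctionRegistry for a common comparison type and wraps the mismatched children in conversion casts before building the Calcite call. The following is a minimal, illustrative sketch of that type lookup only; the class and main method are hypothetical, while FunctionRegistry.getCommonClassForComparison and TypeInfoFactory are the same Hive classes used in the snippet.

import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class CommonComparisonTypeSketch {
  public static void main(String[] args) throws Exception {
    // Comparing an int column with a bigint constant: both children are widened to the
    // common comparison type before the comparison is translated to a RexCall.
    TypeInfo left = TypeInfoFactory.intTypeInfo;
    TypeInfo right = TypeInfoFactory.longTypeInfo;
    TypeInfo common = FunctionRegistry.getCommonClassForComparison(left, right);
    // Expected to print "bigint"; children whose type differs from this would be wrapped
    // in a conversion cast, mirroring ParseUtils.createConversionCast(...) above.
    System.out.println(common.getTypeName());
  }
}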
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
Class StatsUtils, method getNDVFor:
private static long getNDVFor(ExprNodeGenericFuncDesc engfd, long numRows, Statistics parentStats) {
  GenericUDF udf = engfd.getGenericUDF();
  if (!FunctionRegistry.isDeterministic(udf)) {
    return numRows;
  }
  List<Long> ndvs = Lists.newArrayList();
  Class<?> udfClass = udf instanceof GenericUDFBridge ? ((GenericUDFBridge) udf).getUdfClass() : udf.getClass();
  NDV ndv = AnnotationUtils.getAnnotation(udfClass, NDV.class);
  long udfNDV = Long.MAX_VALUE;
  if (ndv != null) {
    udfNDV = ndv.maxNdv();
  } else {
    for (String col : engfd.getCols()) {
      ColStatistics stats = parentStats.getColumnStatisticsFromColName(col);
      if (stats != null) {
        ndvs.add(stats.getCountDistint());
      }
    }
  }
  long countDistincts = ndvs.isEmpty() ? numRows : addWithExpDecay(ndvs);
  return Collections.min(Lists.newArrayList(countDistincts, udfNDV, numRows));
}
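getNDVFor caps the estimate by the UDF's declared NDV (the @NDV annotation's maxNdv), the row count, and a combination of the input columns' distinct counts. addWithExpDecay is not shown in this snippet; the sketch below is a hypothetical stand-in illustrating an exponential-decay combination (the largest NDV counts fully, the next contributes its square root, then its fourth root, and so on). The actual formula in StatsUtils may differ.

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public final class NdvDecaySketch {
  // Hypothetical stand-in for StatsUtils.addWithExpDecay: additional columns raise the
  // combined NDV estimate, but with rapidly diminishing effect.
  static long addWithExpDecay(List<Long> distinctVals) {
    Collections.sort(distinctVals, Collections.reverseOrder());
    double combined = distinctVals.get(0);
    double root = 2d;
    for (int i = 1; i < distinctVals.size(); i++) {
      combined *= Math.pow(distinctVals.get(i), 1d / root);
      root *= 2d;
    }
    return (long) combined;
  }

  public static void main(String[] args) {
    // NDVs 1000 and 100 combine to 1000 * sqrt(100) = 10000 under this scheme.
    System.out.println(addWithExpDecay(Arrays.asList(1000L, 100L)));
  }
}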
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
Class TestOrcSplitElimination, method testSplitEliminationComplexExpr:
@Test
public void testSplitEliminationComplexExpr() throws Exception {
ObjectInspector inspector = createIO();
Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.NONE, 10000, 10000);
writeData(writer);
writer.close();
HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1000);
HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 150000);
InputFormat<?, ?> in = new OrcInputFormat();
FileInputFormat.setInputPaths(conf, testFilePath.toString());
// predicate expression: userid <= 100 and subtype <= 1000.0
GenericUDF udf = new GenericUDFOPEqualOrLessThan();
List<ExprNodeDesc> childExpr = Lists.newArrayList();
ExprNodeColumnDesc col = new ExprNodeColumnDesc(Long.class, "userid", "T", false);
ExprNodeConstantDesc con = new ExprNodeConstantDesc(100);
childExpr.add(col);
childExpr.add(con);
ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
GenericUDF udf1 = new GenericUDFOPEqualOrLessThan();
List<ExprNodeDesc> childExpr1 = Lists.newArrayList();
ExprNodeColumnDesc col1 = new ExprNodeColumnDesc(Double.class, "subtype", "T", false);
ExprNodeConstantDesc con1 = new ExprNodeConstantDesc(1000.0);
childExpr1.add(col1);
childExpr1.add(con1);
ExprNodeGenericFuncDesc en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
GenericUDF udf2 = new GenericUDFOPAnd();
List<ExprNodeDesc> childExpr2 = Lists.newArrayList();
childExpr2.add(en);
childExpr2.add(en1);
ExprNodeGenericFuncDesc en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
String sargStr = SerializationUtilities.serializeExpression(en2);
conf.set("hive.io.filter.expr.serialized", sargStr);
InputSplit[] splits = in.getSplits(conf, 1);
assertEquals(2, splits.length);
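// predicate expression: userid <= 2 and subtype <= 0.0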
con = new ExprNodeConstantDesc(2);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
con1 = new ExprNodeConstantDesc(0.0);
childExpr1.set(1, con1);
en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
childExpr2.set(0, en);
childExpr2.set(1, en1);
en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
sargStr = SerializationUtilities.serializeExpression(en2);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
// no stripe will satisfy the predicate
assertEquals(0, splits.length);
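// predicate expression: userid <= 2 and subtype <= 1.0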
con = new ExprNodeConstantDesc(2);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
con1 = new ExprNodeConstantDesc(1.0);
childExpr1.set(1, con1);
en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
childExpr2.set(0, en);
childExpr2.set(1, en1);
en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
sargStr = SerializationUtilities.serializeExpression(en2);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
// only the first stripe will satisfy the condition, hence a single split
assertEquals(1, splits.length);
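// predicate expression: userid = 13 and subtype <= 80.0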
udf = new GenericUDFOPEqual();
con = new ExprNodeConstantDesc(13);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
con1 = new ExprNodeConstantDesc(80.0);
childExpr1.set(1, con1);
en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
childExpr2.set(0, en);
childExpr2.set(1, en1);
en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
sargStr = SerializationUtilities.serializeExpression(en2);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
// the first two stripes will satisfy the condition, hence two splits
assertEquals(2, splits.length);
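// predicate expression: userid = 13 and subtype = 80.0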
udf = new GenericUDFOPEqual();
con = new ExprNodeConstantDesc(13);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
udf1 = new GenericUDFOPEqual();
con1 = new ExprNodeConstantDesc(80.0);
childExpr1.set(1, con1);
en1 = new ExprNodeGenericFuncDesc(inspector, udf1, childExpr1);
childExpr2.set(0, en);
childExpr2.set(1, en1);
en2 = new ExprNodeGenericFuncDesc(inspector, udf2, childExpr2);
sargStr = SerializationUtilities.serializeExpression(en2);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
// only the second stripe will satisfy the condition, hence a single split
assertEquals(1, splits.length);
}
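For reference, the same "userid <= 100 and subtype <= 1000.0" predicate can also be built with ExprNodeGenericFuncDesc.newInstance(...), the factory used in the RexNodeConverter snippet above, which initializes each GenericUDF and derives its return type instead of taking an ObjectInspector explicitly. This is an illustrative sketch, not code from the test; the class and method names are hypothetical.

import java.util.Arrays;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan;

public class SargPredicateSketch {
  // Builds (userid <= 100) AND (subtype <= 1000.0) as an ExprNodeGenericFuncDesc tree.
  public static ExprNodeGenericFuncDesc buildPredicate() throws Exception {
    ExprNodeDesc useridLe100 = ExprNodeGenericFuncDesc.newInstance(
        new GenericUDFOPEqualOrLessThan(),
        Arrays.<ExprNodeDesc>asList(
            new ExprNodeColumnDesc(Long.class, "userid", "T", false),
            new ExprNodeConstantDesc(100)));
    ExprNodeDesc subtypeLe1000 = ExprNodeGenericFuncDesc.newInstance(
        new GenericUDFOPEqualOrLessThan(),
        Arrays.<ExprNodeDesc>asList(
            new ExprNodeColumnDesc(Double.class, "subtype", "T", false),
            new ExprNodeConstantDesc(1000.0)));
    return ExprNodeGenericFuncDesc.newInstance(
        new GenericUDFOPAnd(), Arrays.asList(useridLe100, subtypeLe1000));
  }
}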
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
Class TestOrcSplitElimination, method testSplitEliminationSmallMaxSplit:
@Test
public void testSplitEliminationSmallMaxSplit() throws Exception {
ObjectInspector inspector = createIO();
Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.NONE, 10000, 10000);
writeData(writer);
writer.close();
HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1000);
HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 5000);
InputFormat<?, ?> in = new OrcInputFormat();
FileInputFormat.setInputPaths(conf, testFilePath.toString());
GenericUDF udf = new GenericUDFOPEqualOrLessThan();
List<ExprNodeDesc> childExpr = Lists.newArrayList();
ExprNodeConstantDesc con;
ExprNodeGenericFuncDesc en;
String sargStr;
createTestSarg(inspector, udf, childExpr);
InputSplit[] splits = in.getSplits(conf, 1);
assertEquals(5, splits.length);
con = new ExprNodeConstantDesc(1);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
assertEquals(0, splits.length);
con = new ExprNodeConstantDesc(2);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
assertEquals(1, splits.length);
con = new ExprNodeConstantDesc(5);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
assertEquals(2, splits.length);
con = new ExprNodeConstantDesc(13);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
assertEquals(3, splits.length);
con = new ExprNodeConstantDesc(29);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
assertEquals(4, splits.length);
con = new ExprNodeConstantDesc(70);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
assertEquals(5, splits.length);
}
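The helper createTestSarg(inspector, udf, childExpr) is not shown in this snippet. Judging from the way the tests later replace element 1 of childExpr and from the baseline split counts, it plausibly builds a "userid <= 100" predicate and registers its serialized form in the job conf, roughly as in the sketch below. This is a reconstruction, not the actual helper; it would live in the same test class and use its conf field.

private void createTestSarg(ObjectInspector inspector, GenericUDF udf, List<ExprNodeDesc> childExpr)
    throws Exception {
  childExpr.add(new ExprNodeColumnDesc(Long.class, "userid", "T", false)); // index 0: column
  childExpr.add(new ExprNodeConstantDesc(100));                            // index 1: constant
  ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
  String sargStr = SerializationUtilities.serializeExpression(en);
  conf.set("hive.io.filter.expr.serialized", sargStr);
}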
Use of org.apache.hadoop.hive.ql.udf.generic.GenericUDF in project hive by apache.
Class TestOrcSplitElimination, method testSplitEliminationLargeMaxSplit:
@Test
public void testSplitEliminationLargeMaxSplit() throws Exception {
ObjectInspector inspector = createIO();
Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.NONE, 10000, 10000);
writeData(writer);
writer.close();
HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1000);
HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 150000);
InputFormat<?, ?> in = new OrcInputFormat();
FileInputFormat.setInputPaths(conf, testFilePath.toString());
GenericUDF udf = new GenericUDFOPEqualOrLessThan();
List<ExprNodeDesc> childExpr = Lists.newArrayList();
ExprNodeConstantDesc con;
ExprNodeGenericFuncDesc en;
String sargStr;
createTestSarg(inspector, udf, childExpr);
InputSplit[] splits = in.getSplits(conf, 1);
assertEquals(2, splits.length);
con = new ExprNodeConstantDesc(0);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
// no stripe satisfies the condition
assertEquals(0, splits.length);
con = new ExprNodeConstantDesc(2);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
// only the first stripe will satisfy the condition, hence a single split
assertEquals(1, splits.length);
con = new ExprNodeConstantDesc(5);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
// first stripe will satisfy the predicate and will be a single split, last stripe will be a
// separate split
assertEquals(2, splits.length);
con = new ExprNodeConstantDesc(13);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
// the first 2 stripes will satisfy the predicate and are merged into a single split; the last
// stripe will be a separate split
assertEquals(2, splits.length);
con = new ExprNodeConstantDesc(29);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
// the first 3 stripes will satisfy the predicate and are merged into a single split; the last
// stripe will be a separate split
assertEquals(2, splits.length);
con = new ExprNodeConstantDesc(70);
childExpr.set(1, con);
en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
sargStr = SerializationUtilities.serializeExpression(en);
conf.set("hive.io.filter.expr.serialized", sargStr);
splits = in.getSplits(conf, 1);
// all stripes will satisfy the predicate; under the max split size they end up merged
// into two splits
assertEquals(2, splits.length);
}
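As a usage note, the value stored under "hive.io.filter.expr.serialized" is a serialized ExprNodeGenericFuncDesc; the reading side can recover the expression tree, and with it the GenericUDF, via SerializationUtilities. A minimal sketch, assuming the same conf object as in the tests:

String serialized = conf.get("hive.io.filter.expr.serialized");
ExprNodeGenericFuncDesc restored = SerializationUtilities.deserializeExpression(serialized);
GenericUDF restoredUdf = restored.getGenericUDF(); // e.g. GenericUDFOPEqualOrLessThan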