Use of org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc in project hive by apache.
The class KafkaInputFormat, method computeSplits.
private List<KafkaInputSplit> computeSplits(Configuration configuration) throws IOException, InterruptedException {
  // ExecutorService is used to harness some Kafka blocking calls and interrupt after some duration
  final ExecutorService execService = Executors.newSingleThreadExecutor();
  try (KafkaConsumer consumer = new KafkaConsumer(KafkaUtils.consumerProperties(configuration))) {
    final String topic = configuration.get(KafkaTableProperties.HIVE_KAFKA_TOPIC.getName());
    final long timeoutMs = configuration.getLong(KafkaTableProperties.KAFKA_FETCH_METADATA_TIMEOUT.getName(), -1);
    final int maxTries = configuration.getInt(KafkaTableProperties.MAX_RETRIES.getName(), -1);
    // Hive depends on FileSplits
    JobConf jobConf = new JobConf(configuration);
    Path[] tablePaths = org.apache.hadoop.mapred.FileInputFormat.getInputPaths(jobConf);
    final Future<List<KafkaInputSplit>> futureFullHouse;
    // noinspection unchecked
    futureFullHouse = execService.submit(() -> buildFullScanFromKafka(topic, consumer, tablePaths, maxTries));
    final List<KafkaInputSplit> fullHouse;
    try {
      fullHouse = futureFullHouse.get(timeoutMs, TimeUnit.MILLISECONDS);
    } catch (TimeoutException | ExecutionException e) {
      futureFullHouse.cancel(true);
      LOG.error("cannot generate full scan split", e);
      // at this point we cannot go any further; fail split generation
      throw new IOException(e);
    }
    @SuppressWarnings("unchecked") final ImmutableMap.Builder<TopicPartition, KafkaInputSplit> fullHouseMapBuilder =
        new ImmutableMap.Builder();
    fullHouse.forEach(input ->
        fullHouseMapBuilder.put(new TopicPartition(input.getTopic(), input.getPartition()), input));
    final KafkaScanTrimmer kafkaScanTrimmer = new KafkaScanTrimmer(fullHouseMapBuilder.build(), consumer);
    final String filterExprSerialized = configuration.get(TableScanDesc.FILTER_EXPR_CONF_STR);
    if (filterExprSerialized != null && !filterExprSerialized.isEmpty()) {
      ExprNodeGenericFuncDesc filterExpr = SerializationUtilities.deserializeExpression(filterExprSerialized);
      LOG.info("Kafka trimmer working on Filter tree {}", filterExpr.getExprString());
      Callable<List<KafkaInputSplit>> trimmerWorker = () -> kafkaScanTrimmer.computeOptimizedScan(filterExpr)
          .entrySet()
          .stream()
          .map(Map.Entry::getValue)
          .collect(Collectors.toList());
      Future<List<KafkaInputSplit>> futureTinyHouse = execService.submit(trimmerWorker);
      try {
        return futureTinyHouse.get(timeoutMs, TimeUnit.MILLISECONDS)
            .stream()
            .filter(split -> split.getStartOffset() < split.getEndOffset())
            .collect(Collectors.toList());
      } catch (ExecutionException | TimeoutException e) {
        futureTinyHouse.cancel(true);
        LOG.error("Had an issue with the trimmer; will return full scan", e);
        return fullHouse;
      }
    }
    // Null case: either the filter evaluated to false or there is no filter at all, thus return the full scan
    return fullHouse;
  } finally {
    execService.shutdown();
  }
}
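For context, the filter that computeSplits deserializes is placed in the job configuration by Hive's predicate pushdown machinery under TableScanDesc.FILTER_EXPR_CONF_STR. Below is a minimal, self-contained sketch of that round trip, not taken from the Hive sources: the column name "some_column" and the literal 100L are invented for illustration, and it assumes hive-exec is on the classpath.

import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class FilterRoundTripSketch {
  public static void main(String[] args) throws Exception {
    // Build the expression tree for: some_column > 100 (illustrative column and literal)
    ExprNodeDesc col = new ExprNodeColumnDesc(TypeInfoFactory.longTypeInfo, "some_column", null, false);
    ExprNodeDesc lit = new ExprNodeConstantDesc(TypeInfoFactory.longTypeInfo, 100L);
    ExprNodeGenericFuncDesc filter =
        ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPGreaterThan(), Arrays.asList(col, lit));

    // Producer side: serialize the filter into the conf key that computeSplits later reads
    Configuration conf = new Configuration();
    conf.set(TableScanDesc.FILTER_EXPR_CONF_STR, SerializationUtilities.serializeExpression(filter));

    // Consumer side: the same deserialization call made in computeSplits
    ExprNodeGenericFuncDesc restored =
        SerializationUtilities.deserializeExpression(conf.get(TableScanDesc.FILTER_EXPR_CONF_STR));
    System.out.println(restored.getExprString());
  }
}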
Use of org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc in project hive by apache.
The class ConvertAstToSearchArg, method parse.
/**
 * Do the recursive parse of the Hive ExprNodeDesc into our ExpressionTree.
 * @param expression the Hive ExprNodeDesc
 */
private void parse(ExprNodeDesc expression) {
  // handle the special cases.
  if (expression.getClass() != ExprNodeGenericFuncDesc.class) {
    // if it is a reference to a boolean column, convert it to a truth test.
    if (expression instanceof ExprNodeColumnDesc) {
      ExprNodeColumnDesc columnDesc = (ExprNodeColumnDesc) expression;
      if (columnDesc.getTypeString().equals("boolean")) {
        builder.equals(columnDesc.getColumn(), PredicateLeaf.Type.BOOLEAN, true);
        return;
      }
    }
    // otherwise, we don't know what to do, so make it a maybe
    builder.literal(SearchArgument.TruthValue.YES_NO_NULL);
    return;
  }
  // get the kind of expression
  ExprNodeGenericFuncDesc expr = (ExprNodeGenericFuncDesc) expression;
  Class<?> op = expr.getGenericUDF().getClass();
  // handle the logical operators
  if (op == GenericUDFOPOr.class) {
    builder.startOr();
    addChildren(expr);
    builder.end();
  } else if (op == GenericUDFOPAnd.class) {
    builder.startAnd();
    addChildren(expr);
    builder.end();
  } else if (op == GenericUDFOPNot.class) {
    builder.startNot();
    addChildren(expr);
    builder.end();
  } else if (op == GenericUDFOPEqual.class) {
    createLeaf(PredicateLeaf.Operator.EQUALS, expr);
  } else if (op == GenericUDFOPNotEqual.class) {
    builder.startNot();
    createLeaf(PredicateLeaf.Operator.EQUALS, expr);
    builder.end();
  } else if (op == GenericUDFOPEqualNS.class) {
    createLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, expr);
  } else if (op == GenericUDFOPGreaterThan.class) {
    builder.startNot();
    createLeaf(PredicateLeaf.Operator.LESS_THAN_EQUALS, expr);
    builder.end();
  } else if (op == GenericUDFOPEqualOrGreaterThan.class) {
    builder.startNot();
    createLeaf(PredicateLeaf.Operator.LESS_THAN, expr);
    builder.end();
  } else if (op == GenericUDFOPLessThan.class) {
    createLeaf(PredicateLeaf.Operator.LESS_THAN, expr);
  } else if (op == GenericUDFOPEqualOrLessThan.class) {
    createLeaf(PredicateLeaf.Operator.LESS_THAN_EQUALS, expr);
  } else if (op == GenericUDFIn.class) {
    createLeaf(PredicateLeaf.Operator.IN, expr, 0);
  } else if (op == GenericUDFBetween.class) {
    // Start with a NOT operator when the first child of the GenericUDFBetween operator is set to TRUE
    if (Boolean.TRUE.equals(((ExprNodeConstantDesc) expression.getChildren().get(0)).getValue())) {
      builder.startNot();
      createLeaf(PredicateLeaf.Operator.BETWEEN, expr, 1);
      builder.end();
    } else {
      createLeaf(PredicateLeaf.Operator.BETWEEN, expr, 1);
    }
  } else if (op == GenericUDFOPNull.class) {
    createLeaf(PredicateLeaf.Operator.IS_NULL, expr, 0);
  } else if (op == GenericUDFOPNotNull.class) {
    builder.startNot();
    createLeaf(PredicateLeaf.Operator.IS_NULL, expr, 0);
    builder.end();
  // otherwise, we didn't understand it, so mark it maybe
  } else {
    builder.literal(SearchArgument.TruthValue.YES_NO_NULL);
  }
}
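To see the output side of parse, here is a small sketch that builds by hand the SearchArgument the method would effectively produce for the predicate id < 10 AND flag (where flag is a boolean column, handled by the truth-test branch above). The column names are invented; the builder API (startAnd/lessThan/equals/end) is the same one the method drives.

import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

public class SargBuilderSketch {
  public static void main(String[] args) {
    // The builder calls parse() would emit for: id < 10 AND flag
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
        .startAnd()
          .lessThan("id", PredicateLeaf.Type.LONG, 10L)
          .equals("flag", PredicateLeaf.Type.BOOLEAN, true)
        .end()
        .build();
    System.out.println(sarg);
  }
}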
Use of org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc in project hive by apache.
The class ProjectionPusher, method pushProjectionsAndFilters.
private void pushProjectionsAndFilters(final JobConf jobConf, final String splitPath, final String splitPathWithNoSchema) {
  if (mapWork == null) {
    return;
  } else if (mapWork.getPathToAliases() == null) {
    return;
  }
  final Set<String> aliases = new HashSet<String>();
  try {
    List<String> a = HiveFileFormatUtils.getFromPathRecursively(
        mapWork.getPathToAliases(), new Path(splitPath), null, false, true);
    if (a != null) {
      aliases.addAll(a);
    }
    if (a == null || a.isEmpty()) {
      // TODO: not having aliases for a path usually means a bug somewhere. Should it give up?
      LOG.warn("Couldn't find aliases for " + splitPath);
    }
  } catch (IllegalArgumentException | IOException e) {
    throw new RuntimeException(e);
  }
  // Collect the needed columns from all the aliases and create an ORed filter
  // expression for the table.
  boolean allColumnsNeeded = false;
  boolean noFilters = false;
  Set<Integer> neededColumnIDs = new HashSet<Integer>();
  // To support nested column pruning, we need to track the path from the top to the nested
  // fields
  Set<String> neededNestedColumnPaths = new HashSet<String>();
  List<ExprNodeGenericFuncDesc> filterExprs = new ArrayList<ExprNodeGenericFuncDesc>();
  RowSchema rowSchema = null;
  for (String alias : aliases) {
    final Operator<? extends Serializable> op = mapWork.getAliasToWork().get(alias);
    if (op != null && op instanceof TableScanOperator) {
      final TableScanOperator ts = (TableScanOperator) op;
      if (ts.getNeededColumnIDs() == null) {
        allColumnsNeeded = true;
      } else {
        neededColumnIDs.addAll(ts.getNeededColumnIDs());
        if (ts.getNeededNestedColumnPaths() != null) {
          neededNestedColumnPaths.addAll(ts.getNeededNestedColumnPaths());
        }
      }
      rowSchema = ts.getSchema();
      ExprNodeGenericFuncDesc filterExpr = ts.getConf() == null ? null : ts.getConf().getFilterExpr();
      // No filter if any TS has no filter expression
      noFilters = filterExpr == null;
      filterExprs.add(filterExpr);
    }
  }
  ExprNodeGenericFuncDesc tableFilterExpr = null;
  if (!noFilters) {
    try {
      for (ExprNodeGenericFuncDesc filterExpr : filterExprs) {
        if (tableFilterExpr == null) {
          tableFilterExpr = filterExpr;
        } else {
          tableFilterExpr = ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPOr(),
              Arrays.<ExprNodeDesc>asList(tableFilterExpr, filterExpr));
        }
      }
    } catch (UDFArgumentException ex) {
      LOG.debug("Turning off filtering due to " + ex);
      tableFilterExpr = null;
    }
  }
  // push down projections
  if (!allColumnsNeeded) {
    if (!neededColumnIDs.isEmpty()) {
      ColumnProjectionUtils.appendReadColumns(jobConf, new ArrayList<Integer>(neededColumnIDs));
      ColumnProjectionUtils.appendNestedColumnPaths(jobConf, new ArrayList<String>(neededNestedColumnPaths));
    }
  } else {
    ColumnProjectionUtils.setReadAllColumns(jobConf);
  }
  pushFilters(jobConf, rowSchema, tableFilterExpr);
}
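The OR-combination step above can be exercised in isolation. The sketch below, not taken from the Hive sources, builds two illustrative equality filters (on made-up columns a and b) and merges them exactly as the loop does, via ExprNodeGenericFuncDesc.newInstance with GenericUDFOPOr.

import java.util.Arrays;

import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class OrCombineSketch {
  public static void main(String[] args) throws Exception {
    // Two per-alias filters: a = 1 and b = 2 (illustrative columns and values)
    ExprNodeGenericFuncDesc f1 = eq("a", 1);
    ExprNodeGenericFuncDesc f2 = eq("b", 2);
    // The same combination step pushProjectionsAndFilters performs
    ExprNodeGenericFuncDesc combined = ExprNodeGenericFuncDesc.newInstance(
        new GenericUDFOPOr(), Arrays.<ExprNodeDesc>asList(f1, f2));
    System.out.println(combined.getExprString()); // roughly: ((a = 1) or (b = 2))
  }

  private static ExprNodeGenericFuncDesc eq(String col, int val) throws Exception {
    return ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPEqual(), Arrays.<ExprNodeDesc>asList(
        new ExprNodeColumnDesc(TypeInfoFactory.intTypeInfo, col, null, false),
        new ExprNodeConstantDesc(TypeInfoFactory.intTypeInfo, val)));
  }
}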
Use of org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc in project hive by apache.
The class ConstantPropagateProcFactory, method evaluateFunction.
/**
 * Evaluate a UDF over constant arguments.
 *
 * @param udf UDF object
 * @param exprs the (possibly already folded) argument expressions
 * @param oldExprs the original argument expressions, used to detect type changes
 * @return null if the expression cannot be evaluated (not all parameters are constants), or the
 *         evaluated ExprNodeConstantDesc if possible.
 */
private static ExprNodeDesc evaluateFunction(GenericUDF udf, List<ExprNodeDesc> exprs, List<ExprNodeDesc> oldExprs) {
  DeferredJavaObject[] arguments = new DeferredJavaObject[exprs.size()];
  ObjectInspector[] argois = new ObjectInspector[exprs.size()];
  for (int i = 0; i < exprs.size(); i++) {
    ExprNodeDesc desc = exprs.get(i);
    if (desc instanceof ExprNodeConstantDesc) {
      ExprNodeConstantDesc constant = (ExprNodeConstantDesc) exprs.get(i);
      if (!constant.getTypeInfo().equals(oldExprs.get(i).getTypeInfo())) {
        constant = typeCast(constant, oldExprs.get(i).getTypeInfo());
        if (constant == null) {
          return null;
        }
      }
      if (constant.getTypeInfo().getCategory() != Category.PRIMITIVE) {
        // nested complex types cannot be folded cleanly
        return null;
      }
      Object value = constant.getValue();
      PrimitiveTypeInfo pti = (PrimitiveTypeInfo) constant.getTypeInfo();
      Object writableValue = null == value ? value
          : PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(pti).getPrimitiveWritableObject(value);
      arguments[i] = new DeferredJavaObject(writableValue);
      argois[i] = ObjectInspectorUtils.getConstantObjectInspector(constant.getWritableObjectInspector(), writableValue);
    } else if (desc instanceof ExprNodeGenericFuncDesc) {
      ExprNodeDesc evaluatedFn = foldExpr((ExprNodeGenericFuncDesc) desc);
      if (null == evaluatedFn || !(evaluatedFn instanceof ExprNodeConstantDesc)) {
        return null;
      }
      ExprNodeConstantDesc constant = (ExprNodeConstantDesc) evaluatedFn;
      if (constant.getTypeInfo().getCategory() != Category.PRIMITIVE) {
        // nested complex types cannot be folded cleanly
        return null;
      }
      Object writableValue = PrimitiveObjectInspectorFactory
          .getPrimitiveJavaObjectInspector((PrimitiveTypeInfo) constant.getTypeInfo())
          .getPrimitiveWritableObject(constant.getValue());
      arguments[i] = new DeferredJavaObject(writableValue);
      argois[i] = ObjectInspectorUtils.getConstantObjectInspector(constant.getWritableObjectInspector(), writableValue);
    } else {
      return null;
    }
  }
  try {
    ObjectInspector oi = udf.initialize(argois);
    Object o = udf.evaluate(arguments);
    if (LOG.isDebugEnabled()) {
      LOG.debug(udf.getClass().getName() + "(" + exprs + ")=" + o);
    }
    if (o == null) {
      return new ExprNodeConstantDesc(TypeInfoUtils.getTypeInfoFromObjectInspector(oi), o);
    }
    Class<?> clz = o.getClass();
    if (PrimitiveObjectInspectorUtils.isPrimitiveWritableClass(clz)) {
      PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
      TypeInfo typeInfo = poi.getTypeInfo();
      o = poi.getPrimitiveJavaObject(o);
      if (typeInfo.getTypeName().contains(serdeConstants.DECIMAL_TYPE_NAME)
          || typeInfo.getTypeName().contains(serdeConstants.VARCHAR_TYPE_NAME)
          || typeInfo.getTypeName().contains(serdeConstants.CHAR_TYPE_NAME)
          || typeInfo.getTypeName().contains(serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME)) {
        return new ExprNodeConstantDesc(typeInfo, o);
      }
    } else if (udf instanceof GenericUDFStruct && oi instanceof StandardConstantStructObjectInspector) {
      // do not fold named_struct, only struct()
      ConstantObjectInspector coi = (ConstantObjectInspector) oi;
      TypeInfo structType = TypeInfoUtils.getTypeInfoFromObjectInspector(coi);
      return new ExprNodeConstantDesc(structType, ObjectInspectorUtils.copyToStandardJavaObject(o, coi));
    } else if (!PrimitiveObjectInspectorUtils.isPrimitiveJavaClass(clz)) {
      if (LOG.isErrorEnabled()) {
        LOG.error("Unable to evaluate {}({}). Return value unrecognizable.", udf.getClass().getName(), exprs);
      }
      return null;
    } else {
      // fall through
    }
    String constStr = null;
    if (arguments.length == 1 && FunctionRegistry.isOpCast(udf)) {
      // remember the original string representation of the constant.
      constStr = arguments[0].get().toString();
    }
    return new ExprNodeConstantDesc(o).setFoldedFromVal(constStr);
  } catch (HiveException e) {
    LOG.error("Evaluation of function {}({}) failed in Constant Propagation Optimizer.", udf.getClass().getName(), exprs);
    throw new RuntimeException(e);
  }
}
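To make the folding mechanics concrete, here is a minimal sketch, not taken from the Hive sources, of the same initialize/evaluate sequence evaluateFunction performs, applied to concat('2001', '12'). It assumes hive-exec is on the classpath, and it skips the final ExprNodeConstantDesc wrapping.

import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFConcat;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.io.Text;

public class ConstantFoldSketch {
  public static void main(String[] args) throws Exception {
    GenericUDF udf = new GenericUDFConcat();
    // Constant object inspectors for the two string arguments, mirroring how
    // evaluateFunction builds argois from ExprNodeConstantDesc values
    ObjectInspector[] argois = {
        ObjectInspectorUtils.getConstantObjectInspector(
            PrimitiveObjectInspectorFactory.writableStringObjectInspector, new Text("2001")),
        ObjectInspectorUtils.getConstantObjectInspector(
            PrimitiveObjectInspectorFactory.writableStringObjectInspector, new Text("12"))
    };
    udf.initialize(argois);
    // Evaluate with deferred arguments, just as the optimizer does at compile time
    Object o = udf.evaluate(new DeferredObject[] {
        new DeferredJavaObject(new Text("2001")), new DeferredJavaObject(new Text("12")) });
    System.out.println(o); // 200112, the value the optimizer would wrap in an ExprNodeConstantDesc
  }
}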
Use of org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc in project hive by apache.
The class ConstantPropagateProcFactory, method foldExprFull.
/**
 * Fold the input expression desc.
 *
 * This function recursively checks whether any subexpression of the specified expression
 * can be evaluated to a constant, and replaces such a subexpression with the constant.
 * If the expression is a deterministic UDF and all its subexpressions are constants,
 * the value is calculated immediately (at compile time rather than at runtime).
 * e.g.:
 *   concat(year, month) => 200112 for year=2001, month=12, since concat is a deterministic UDF
 *   unix_timestamp(time) => unix_timestamp(123) for time=123, since unix_timestamp is a nondeterministic UDF
 * @param desc the expression to fold
 * @param constants current propagated constant map
 * @param cppCtx the constant propagation context
 * @param op processing operator
 * @param tag index of the parent operator of op used when resolving column expressions
 * @param propagate if true, assignment expressions will be added to constants.
 * @return the folded expression
 * @throws UDFArgumentException
 */
private static ExprNodeDesc foldExprFull(ExprNodeDesc desc, Map<ColumnInfo, ExprNodeDesc> constants, ConstantPropagateProcCtx cppCtx, Operator<? extends Serializable> op, int tag, boolean propagate) throws UDFArgumentException {
  // Combine the NOT operator with its child operator first. Otherwise, optimizing from the
  // bottom up could lead to an incorrect result: for example, not(x > 3 and x is not null)
  // should not be optimized to not(x > 3), but to (x <= 3 or x is null).
  desc = foldNegative(desc);
  if (desc instanceof ExprNodeGenericFuncDesc) {
    ExprNodeGenericFuncDesc funcDesc = (ExprNodeGenericFuncDesc) desc;
    GenericUDF udf = funcDesc.getGenericUDF();
    boolean propagateNext = propagate && propagatableUdfs.contains(udf.getClass());
    List<ExprNodeDesc> newExprs = new ArrayList<ExprNodeDesc>();
    for (ExprNodeDesc childExpr : desc.getChildren()) {
      newExprs.add(foldExpr(childExpr, constants, cppCtx, op, tag, propagateNext));
    }
    // Don't evaluate a nondeterministic function, since its value can only be calculated at runtime.
    if (!isConstantFoldableUdf(udf, newExprs)) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Function " + udf.getClass() + " is nondeterministic. Don't evaluate immediately.");
      }
      ((ExprNodeGenericFuncDesc) desc).setChildren(newExprs);
      return desc;
    } else {
      // If all child expressions of a deterministic function are constants, evaluate the UDF immediately
      ExprNodeDesc constant = evaluateFunction(udf, newExprs, desc.getChildren());
      if (constant != null) {
        LOG.debug("Folding expression: {} -> {}", desc, constant);
        return constant;
      } else {
        // Check if the function can be short cut.
        ExprNodeDesc shortcut = shortcutFunction(udf, newExprs, op);
        if (shortcut != null) {
          LOG.debug("Folding expression: {} -> {}", desc, shortcut);
          return shortcut;
        }
        ((ExprNodeGenericFuncDesc) desc).setChildren(newExprs);
      }
      // If in some selected binary operators (=, is null, etc.) one of the operands is
      // constant, add them to colToConstants as half-deterministic columns.
      if (propagate) {
        propagate(udf, newExprs, op.getSchema(), constants);
      }
    }
    return desc;
  } else if (desc instanceof ExprNodeColumnDesc) {
    if (op.getParentOperators() == null || op.getParentOperators().isEmpty()) {
      return desc;
    }
    Operator<? extends Serializable> parent = op.getParentOperators().get(tag);
    ExprNodeDesc col = evaluateColumn((ExprNodeColumnDesc) desc, cppCtx, parent);
    if (col != null) {
      LOG.debug("Folding expression: {} -> {}", desc, col);
      return col;
    }
  }
  return desc;
}
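The caveat about foldNegative in the comment above can be checked with a worked example. The following self-contained sketch (plain Java, no Hive APIs; a null Boolean stands in for SQL's UNKNOWN) shows that not(x > 3 and x is not null) agrees with (x <= 3 or x is null) in every case, while the naive rewrite not(x > 3) disagrees when x is null.

public class ThreeValuedLogicSketch {
  // SQL three-valued logic, with null modeling UNKNOWN
  static Boolean not(Boolean b) { return b == null ? null : !b; }
  static Boolean and(Boolean a, Boolean b) {
    if (Boolean.FALSE.equals(a) || Boolean.FALSE.equals(b)) return false;
    if (a == null || b == null) return null;
    return true;
  }
  static Boolean or(Boolean a, Boolean b) {
    if (Boolean.TRUE.equals(a) || Boolean.TRUE.equals(b)) return true;
    if (a == null || b == null) return null;
    return false;
  }
  static Boolean gt(Integer x, int k) { return x == null ? null : x > k; }
  static Boolean le(Integer x, int k) { return x == null ? null : x <= k; }

  public static void main(String[] args) {
    for (Integer x : new Integer[] { null, 2, 5 }) {
      Boolean original = not(and(gt(x, 3), x != null)); // not(x > 3 and x is not null)
      Boolean correct = or(le(x, 3), x == null);        // (x <= 3 or x is null)
      Boolean naive = not(gt(x, 3));                    // not(x > 3) -- loses the null case
      System.out.println("x=" + x + " original=" + original + " correct=" + correct + " naive=" + naive);
    }
  }
}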