Use of org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext in project hive by apache.
The class ConvertJoinMapJoin, method process.
@Override
/*
 * (non-Javadoc) We should ideally not modify the tree we traverse. However,
 * since we need to walk the tree at any time when we modify the operator, we
 * might as well do it here.
 */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
OptimizeTezProcContext context = (OptimizeTezProcContext) procCtx;
hashTableLoadFactor = context.conf.getFloatVar(ConfVars.HIVEHASHTABLELOADFACTOR);
fastHashTableAvailable = context.conf.getBoolVar(ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED);
JoinOperator joinOp = (JoinOperator) nd;
// adjust noconditional task size threshold for LLAP
LlapClusterStateForCompile llapInfo = null;
if ("llap".equalsIgnoreCase(context.conf.getVar(ConfVars.HIVE_EXECUTION_MODE))) {
llapInfo = LlapClusterStateForCompile.getClusterInfo(context.conf);
llapInfo.initClusterInfo();
}
MemoryMonitorInfo memoryMonitorInfo = getMemoryMonitorInfo(context.conf, llapInfo);
joinOp.getConf().setMemoryMonitorInfo(memoryMonitorInfo);
maxJoinMemory = memoryMonitorInfo.getAdjustedNoConditionalTaskSize();
LOG.info("maxJoinMemory: {}", maxJoinMemory);
hashMapDataStructure = HashMapDataStructureType.of(joinOp.getConf());
TezBucketJoinProcCtx tezBucketJoinProcCtx = new TezBucketJoinProcCtx(context.conf);
boolean hiveConvertJoin = context.conf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN) & !context.parseContext.getDisableMapJoin();
if (!hiveConvertJoin) {
// we are just converting to a common merge join operator. The shuffle
// join in map-reduce case.
Object retval = checkAndConvertSMBJoin(context, joinOp, tezBucketJoinProcCtx);
if (retval == null) {
return retval;
} else {
fallbackToReduceSideJoin(joinOp, context);
return null;
}
}
// if we have traits, and table info is present in the traits, we know the
// exact number of buckets. Else choose the largest number of estimated
// reducers from the parent operators.
int numBuckets = -1;
if (context.conf.getBoolVar(HiveConf.ConfVars.HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ)) {
numBuckets = estimateNumBuckets(joinOp, true);
} else {
numBuckets = 1;
}
LOG.info("Estimated number of buckets " + numBuckets);
MapJoinConversion mapJoinConversion = getMapJoinConversion(joinOp, context, numBuckets, false, maxJoinMemory, true);
if (mapJoinConversion == null) {
Object retval = checkAndConvertSMBJoin(context, joinOp, tezBucketJoinProcCtx);
if (retval == null) {
return retval;
} else {
// only case is full outer join with SMB enabled which is not possible. Convert to regular
// join.
fallbackToReduceSideJoin(joinOp, context);
return null;
}
}
if (numBuckets > 1) {
if (context.conf.getBoolVar(HiveConf.ConfVars.HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ)) {
// Check if we are in LLAP, if so it needs to be determined if we should use BMJ or DPHJ
if (llapInfo != null) {
if (selectJoinForLlap(context, joinOp, tezBucketJoinProcCtx, llapInfo, mapJoinConversion, numBuckets)) {
return null;
}
} else if (convertJoinBucketMapJoin(joinOp, context, mapJoinConversion, tezBucketJoinProcCtx)) {
return null;
}
}
}
// check if we can convert to map join with no bucket scaling.
LOG.info("Convert to non-bucketed map join");
if (numBuckets != 1) {
mapJoinConversion = getMapJoinConversion(joinOp, context, 1, false, maxJoinMemory, true);
}
if (mapJoinConversion == null) {
// we are just converting to a common merge join operator. The shuffle
// join in map-reduce case.
fallbackToReduceSideJoin(joinOp, context);
return null;
}
// Currently, this is a MJ path and we don't support FULL OUTER MapJoin yet.
if (mapJoinConversion.getIsFullOuterJoin() && !mapJoinConversion.getIsFullOuterEnabledForMapJoin()) {
fallbackToReduceSideJoin(joinOp, context);
return null;
}
MapJoinOperator mapJoinOp = convertJoinMapJoin(joinOp, context, mapJoinConversion, true);
if (mapJoinOp == null) {
fallbackToReduceSideJoin(joinOp, context);
return null;
}
// map join operator by default has no bucket cols and num of reduce sinks
// reduced by 1
mapJoinOp.setOpTraits(new OpTraits(null, -1, null, joinOp.getOpTraits().getNumReduceSinks()));
preserveOperatorInfos(mapJoinOp, joinOp, context);
// propagate this change till the next RS
for (Operator<? extends OperatorDesc> childOp : mapJoinOp.getChildOperators()) {
setAllChildrenTraits(childOp, mapJoinOp.getOpTraits());
}
return null;
}
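For orientation, ConvertJoinMapJoin is not called directly: TezCompiler registers it as a node processor and walks the operator tree, handing every matched node the shared OptimizeTezProcContext used above. The sketch below shows that wiring in reduced form. It assumes the classic org.apache.hadoop.hive.ql.lib names (Dispatcher, GraphWalker, RuleRegExp, NodeProcessor), which newer Hive branches have renamed (SemanticDispatcher and friends); the rule labels and the helper class name are illustrative, not taken from the Hive source.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.JoinOperator;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.ForwardWalker;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.optimizer.ConvertJoinMapJoin;
import org.apache.hadoop.hive.ql.optimizer.SetReducerParallelism;
import org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;

public class OptimizeTezWalkSketch {

  // Walks the operator DAG rooted at the query's TableScans and fires the
  // registered processors; procCtx carries the HiveConf, the ParseContext and
  // bookkeeping sets such as visitedReduceSinks and pruningOpsRemovedByPriorOpt.
  public static void runJoinOptimizations(OptimizeTezProcContext procCtx) throws SemanticException {
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<>();
    opRules.put(new RuleRegExp("Set parallelism - ReduceSink",
        ReduceSinkOperator.getOperatorName() + "%"), new SetReducerParallelism());
    opRules.put(new RuleRegExp("Convert Join to Map-join",
        JoinOperator.getOperatorName() + "%"), new ConvertJoinMapJoin());

    // No default processor: nodes that match no rule are simply skipped.
    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
    GraphWalker ogw = new ForwardWalker(disp);

    List<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(procCtx.parseContext.getTopOps().values());
    ogw.startWalking(topNodes, null);
  }
}

The same dispatch pattern drives the other processors in this section; only the rule expressions and processor instances change.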
Use of org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext in project hive by apache.
The class DynamicPartitionPruningOptimization, method process.
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
ParseContext parseContext;
if (procCtx instanceof OptimizeTezProcContext) {
parseContext = ((OptimizeTezProcContext) procCtx).parseContext;
} else if (procCtx instanceof OptimizeSparkProcContext) {
parseContext = ((OptimizeSparkProcContext) procCtx).getParseContext();
} else {
throw new IllegalArgumentException("expected parseContext to be either " + "OptimizeTezProcContext or OptimizeSparkProcContext, but found " + procCtx.getClass().getName());
}
FilterOperator filter = (FilterOperator) nd;
FilterDesc desc = filter.getConf();
if (!parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING) && !parseContext.getConf().isSparkDPPAny()) {
// nothing to do when the optimization is off
return null;
}
TableScanOperator ts = null;
if (filter.getParentOperators().size() == 1 && filter.getParentOperators().get(0) instanceof TableScanOperator) {
ts = (TableScanOperator) filter.getParentOperators().get(0);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Parent: " + filter.getParentOperators().get(0));
LOG.debug("Filter: " + desc.getPredicateString());
LOG.debug("TableScan: " + ts);
}
DynamicPartitionPrunerContext removerContext = new DynamicPartitionPrunerContext();
// collect the dynamic pruning conditions
removerContext.dynLists.clear();
GenTezUtils.collectDynamicPruningConditions(desc.getPredicate(), removerContext);
if (ts == null) {
// Replace the synthetic predicate with true and bail out
for (DynamicListContext ctx : removerContext) {
ExprNodeDesc constNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true);
replaceExprNode(ctx, desc, constNode);
}
return false;
}
boolean semiJoin = parseContext.getConf().getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION);
if (HiveConf.getVar(parseContext.getConf(), HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
// TODO HIVE-16862: Implement a similar feature like "hive.tez.dynamic.semijoin.reduction" in hive on spark
semiJoin = false;
}
List<ExprNodeDesc> newBetweenNodes = new ArrayList<>();
List<ExprNodeDesc> newBloomFilterNodes = new ArrayList<>();
for (DynamicListContext ctx : removerContext) {
if (ctx.desc.getTypeInfo().getCategory() != ObjectInspector.Category.PRIMITIVE) {
// https://issues.apache.org/jira/browse/HIVE-24988
continue;
}
String column = ExprNodeDescUtils.extractColName(ctx.parent);
boolean semiJoinAttempted = false;
ExprNodeDesc constNode = new ExprNodeConstantDesc(ctx.parent.getTypeInfo(), true);
if (column != null) {
// Need unique IDs to refer to each min/max key value in the DynamicValueRegistry
String keyBaseAlias = "";
Table table = ts.getConf().getTableMetadata();
boolean nonEquiJoin = isNonEquiJoin(ctx.parent);
if (table != null && table.isPartitionKey(column) && !nonEquiJoin) {
String columnType = table.getPartColByName(column).getType();
String alias = ts.getConf().getAlias();
PrunedPartitionList plist = parseContext.getPrunedPartitions(alias, ts);
if (LOG.isDebugEnabled()) {
LOG.debug("alias: " + alias);
LOG.debug("pruned partition list: ");
if (plist != null) {
for (Partition p : plist.getPartitions()) {
LOG.debug(p.getCompleteName());
}
}
}
// have been already filtered
if (plist == null || plist.getPartitions().size() != 0) {
LOG.info("Dynamic partitioning: " + table.getCompleteName() + "." + column);
generateEventOperatorPlan(ctx, parseContext, ts, column, columnType, null);
} else {
// all partitions have been statically removed
LOG.debug("No partition pruning necessary.");
}
} else if (table.isNonNative() && table.getStorageHandler().addDynamicSplitPruningEdge(table, ctx.parent)) {
generateEventOperatorPlan(ctx, parseContext, ts, column, table.getCols().stream().filter(e -> e.getName().equals(column)).map(e -> e.getType()).findFirst().get(), ctx.parent);
} else {
// semijoin
LOG.debug("Column " + column + " is not a partition column");
if (semiJoin && !disableSemiJoinOptDueToExternalTable(parseContext.getConf(), ts, ctx) && ts.getConf().getFilterExpr() != null && !nonEquiJoin) {
LOG.debug("Initiate semijoin reduction for " + column + " (" + ts.getConf().getFilterExpr().getExprString());
StringBuilder internalColNameBuilder = new StringBuilder();
StringBuilder colNameBuilder = new StringBuilder();
// Apply best effort to fetch the correct table alias. If not
// found, fallback to old logic.
StringBuilder tabAliasBuilder = new StringBuilder();
if (getColumnInfo(ctx, internalColNameBuilder, colNameBuilder, tabAliasBuilder)) {
String colName = colNameBuilder.toString();
String tableAlias;
if (tabAliasBuilder.length() > 0) {
tableAlias = tabAliasBuilder.toString();
} else {
// falling back
Operator<?> op = ctx.generator;
while (!(op == null || op instanceof TableScanOperator)) {
op = op.getParentOperators().get(0);
}
tableAlias = (op == null ? "" : ((TableScanOperator) op).getConf().getAlias());
}
// Use the tableAlias to generate keyBaseAlias
keyBaseAlias = ctx.generator.getOperatorId() + "_" + tableAlias + "_" + colName;
Map<String, List<SemiJoinHint>> hints = parseContext.getSemiJoinHints();
if (hints != null) {
// Create semijoin optimizations ONLY for hinted columns
semiJoinAttempted = processSemiJoinHints(parseContext, ctx, hints, tableAlias, internalColNameBuilder.toString(), colName, ts, keyBaseAlias);
} else {
// fallback to regular logic
semiJoinAttempted = generateSemiJoinOperatorPlan(ctx, parseContext, ts, keyBaseAlias, internalColNameBuilder.toString(), colName, null);
}
}
}
}
// we always remove the condition by replacing it with "true"
if (semiJoinAttempted) {
List<ExprNodeDesc> betweenArgs = new ArrayList<ExprNodeDesc>();
// Do not invert between result
betweenArgs.add(new ExprNodeConstantDesc(Boolean.FALSE));
// add column expression here
betweenArgs.add(ctx.parent.getChildren().get(0));
betweenArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_min", ctx.desc.getTypeInfo())));
betweenArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_max", ctx.desc.getTypeInfo())));
ExprNodeDesc betweenNode = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("between").getGenericUDF(), betweenArgs);
// add column expression for bloom filter
List<ExprNodeDesc> bloomFilterArgs = new ArrayList<ExprNodeDesc>();
bloomFilterArgs.add(ctx.parent.getChildren().get(0));
bloomFilterArgs.add(new ExprNodeDynamicValueDesc(new DynamicValue(keyBaseAlias + "_bloom_filter", TypeInfoFactory.binaryTypeInfo)));
ExprNodeDesc bloomFilterNode = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("in_bloom_filter").getGenericUDF(), bloomFilterArgs);
newBetweenNodes.add(betweenNode);
newBloomFilterNodes.add(bloomFilterNode);
}
}
replaceExprNode(ctx, desc, constNode);
}
if (!newBetweenNodes.isEmpty()) {
// We need to add the new nodes: first the between nodes, then the bloom filters
if (FunctionRegistry.isOpAnd(desc.getPredicate())) {
// AND
desc.getPredicate().getChildren().addAll(newBetweenNodes);
desc.getPredicate().getChildren().addAll(newBloomFilterNodes);
} else {
List<ExprNodeDesc> andArgs = new ArrayList<>();
andArgs.add(desc.getPredicate());
andArgs.addAll(newBetweenNodes);
andArgs.addAll(newBloomFilterNodes);
ExprNodeGenericFuncDesc andExpr = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("and").getGenericUDF(), andArgs);
// Also pass in filter as tableScan filterExpr
ts.getConf().setFilterExpr(andExpr);
desc.setPredicate(andExpr);
}
}
// if we pushed the predicate into the table scan we need to remove the
// synthetic conditions there.
cleanTableScanFilters(ts);
return false;
}
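Both guards at the top of this method read plain HiveConf switches, so the pass can be steered entirely from configuration. The following is a minimal sketch, with a hypothetical helper class name, of preparing a conf so that partition-pruning events are still generated while the semijoin (min/max/bloom-filter) reduction path is skipped; it only uses setBoolVar and the ConfVars already referenced above.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;

public class DynamicPruningConfSketch {

  // Returns a copy of the given conf tuned so that the pass above produces
  // partition-pruning events but never attempts semijoin reduction.
  public static HiveConf partitionPruningOnly(HiveConf base) {
    HiveConf conf = new HiveConf(base);
    // Main switch read at the top of process(); with this off (and no Spark
    // DPP) the method bails out immediately and leaves the filter untouched.
    conf.setBoolVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING, true);
    // Controls the semiJoin flag: with false, no between/in_bloom_filter
    // predicates are generated for non-partition columns.
    conf.setBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION, false);
    return conf;
  }
}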
Use of org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext in project hive by apache.
The class RemoveDynamicPruningBySize, method process.
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
OptimizeTezProcContext context = (OptimizeTezProcContext) procContext;
AppMasterEventOperator event = (AppMasterEventOperator) nd;
AppMasterEventDesc desc = event.getConf();
if (desc.getStatistics().getDataSize() > context.conf.getLongVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE) && (context.pruningOpsRemovedByPriorOpt.isEmpty() || !context.pruningOpsRemovedByPriorOpt.contains(event))) {
context.pruningOpsRemovedByPriorOpt.add(event);
GenTezUtils.removeBranch(event);
// at this point we've found the fork in the op pipeline that has the pruning as a child plan.
LOG.info("Disabling dynamic pruning for: " + ((DynamicPruningEventDesc) desc).getTableScan().getName() + ". Expected data size is too big: " + desc.getStatistics().getDataSize());
}
return false;
}
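The whole decision reduces to one comparison between the event operator's estimated output size and the ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE threshold. Below is that guard factored out into a small helper, a sketch only (class and method names are made up); note that the isEmpty() test in the original condition is redundant, since contains() on an empty set already returns false.

import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.exec.AppMasterEventOperator;
import org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext;

public class PruningSizeCheckSketch {

  // Mirrors the guard in process(): the pruning branch is removed only when
  // its estimated output exceeds the configured threshold and no earlier
  // optimization has already removed it.
  public static boolean shouldRemove(OptimizeTezProcContext context, AppMasterEventOperator event) {
    long threshold = context.conf.getLongVar(ConfVars.TEZ_DYNAMIC_PARTITION_PRUNING_MAX_DATA_SIZE);
    long dataSize = event.getConf().getStatistics().getDataSize();
    return dataSize > threshold && !context.pruningOpsRemovedByPriorOpt.contains(event);
  }
}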
Use of org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext in project hive by apache.
The class SetReducerParallelism, method process.
@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
OptimizeTezProcContext context = (OptimizeTezProcContext) procContext;
ReduceSinkOperator sink = (ReduceSinkOperator) nd;
ReduceSinkDesc desc = sink.getConf();
long bytesPerReducer = context.conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
int maxReducers = context.conf.getIntVar(HiveConf.ConfVars.MAXREDUCERS);
int constantReducers = context.conf.getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
if (context.visitedReduceSinks.contains(sink)) {
// skip walking the children
LOG.debug("Already processed reduce sink: " + sink.getName());
return true;
}
context.visitedReduceSinks.add(sink);
if (desc.getNumReducers() <= 0) {
if (constantReducers > 0) {
LOG.info("Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
desc.setNumReducers(constantReducers);
} else {
long numberOfBytes = 0;
// we need to add up all the estimates from the siblings of this reduce sink
for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
if (sibling.getStatistics() != null) {
numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize());
} else {
LOG.warn("No stats available from: " + sibling);
}
}
int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers);
desc.setNumReducers(numReducers);
final Collection<ExprNodeDescEqualityWrapper> keyCols = ExprNodeDescEqualityWrapper.transform(desc.getKeyCols());
final Collection<ExprNodeDescEqualityWrapper> partCols = ExprNodeDescEqualityWrapper.transform(desc.getPartitionCols());
if (keyCols != null && keyCols.equals(partCols)) {
desc.setReducerTraits(EnumSet.of(UNIFORM, AUTOPARALLEL));
} else {
desc.setReducerTraits(EnumSet.of(AUTOPARALLEL));
}
}
} else {
LOG.info("Number of reducers determined to be: " + desc.getNumReducers());
// usually controlled by bucketing
desc.setReducerTraits(EnumSet.of(FIXED));
}
return false;
}
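When neither the user (HADOOPNUMREDUCERS) nor an earlier pass has pinned the reducer count, the parallelism comes from Utilities.estimateReducers applied to the summed sibling data sizes. Roughly, and ignoring the power-of-two rounding and overflow handling in the real method, that estimate behaves like ceil(totalBytes / bytesPerReducer) clamped to [1, maxReducers]. The standalone sketch below (hypothetical class name) illustrates the arithmetic with the common defaults of 256 MB per reducer and a cap of 1009 reducers.

// A back-of-the-envelope approximation of Utilities.estimateReducers as used
// above (with powersOfTwo == false); treat this as illustrative only.
public final class ReducerEstimateSketch {

  public static int estimateReducers(long totalInputBytes, long bytesPerReducer, int maxReducers) {
    if (totalInputBytes <= 0) {
      return 1;
    }
    // One reducer per bytesPerReducer of input, rounded up ...
    long reducers = (totalInputBytes + bytesPerReducer - 1) / bytesPerReducer;
    // ... clamped to the configured maximum and to at least one reducer.
    reducers = Math.min(reducers, maxReducers);
    return (int) Math.max(1, reducers);
  }

  public static void main(String[] args) {
    // Example: 10 GB of input at 256 MB per reducer, capped at 1009 reducers.
    System.out.println(estimateReducers(10L << 30, 256L << 20, 1009)); // prints 40
  }
}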