Example 11 with SelectOperator

use of org.apache.hadoop.hive.ql.exec.SelectOperator in project hive by apache.

the class TezCompiler method removeSemijoinOptimizationByBenefit.

private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx) throws SemanticException {
    if (!procCtx.conf.getBoolVar(ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION)) {
        // Not needed without semi-join reduction
        return;
    }
    List<ReduceSinkOperator> semijoinRsToRemove = new ArrayList<ReduceSinkOperator>();
    Map<ReduceSinkOperator, SemiJoinBranchInfo> map = procCtx.parseContext.getRsToSemiJoinBranchInfo();
    double semijoinReductionThreshold = procCtx.conf.getFloatVar(HiveConf.ConfVars.TEZ_DYNAMIC_SEMIJOIN_REDUCTION_THRESHOLD);
    for (ReduceSinkOperator rs : map.keySet()) {
        SemiJoinBranchInfo sjInfo = map.get(rs);
        if (sjInfo.getIsHint() || !sjInfo.getShouldRemove()) {
            // Semijoin created using hint or marked useful, skip it
            continue;
        }
        // rs is semijoin optimization branch, which should look like <Parent>-SEL-GB1-RS1-GB2-RS2
        // Get to the SelectOperator ancestor
        SelectOperator sel = null;
        for (Operator<?> currOp = rs; currOp.getParentOperators().size() > 0; currOp = currOp.getParentOperators().get(0)) {
            if (currOp instanceof SelectOperator) {
                sel = (SelectOperator) currOp;
                break;
            }
        }
        if (sel == null) {
            throw new SemanticException("Unexpected error - could not find SEL ancestor from semijoin branch of " + rs);
        }
        // Check the ndv/rows from the SEL vs the destination tablescan the semijoin opt is going to.
        TableScanOperator ts = sjInfo.getTsOp();
        RuntimeValuesInfo rti = procCtx.parseContext.getRsToRuntimeValuesInfoMap().get(rs);
        ExprNodeDesc tsExpr = rti.getTsColExpr();
        // In the SEL operator of the semijoin branch, there should be only one column in the operator
        ExprNodeDesc selExpr = sel.getConf().getColList().get(0);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Computing BloomFilter cost/benefit for " + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(ts) + " (" + tsExpr + ")");
        }
        double reductionFactor = computeBloomFilterNetBenefit(sel, selExpr, (FilterOperator) ts.getChildOperators().get(0), tsExpr);
        if (reductionFactor < semijoinReductionThreshold) {
            // This semijoin optimization should be removed. Do it after we're done iterating
            semijoinRsToRemove.add(rs);
        }
    }
    for (ReduceSinkOperator rs : semijoinRsToRemove) {
        TableScanOperator ts = map.get(rs).getTsOp();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Reduction factor not satisfied for " + OperatorUtils.getOpNamePretty(rs) + "-" + OperatorUtils.getOpNamePretty(ts) + ". Removing semijoin optimization.");
        }
        GenTezUtils.removeBranch(rs);
        GenTezUtils.removeSemiJoinOperator(procCtx.parseContext, rs, ts);
    }
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ArrayList(java.util.ArrayList) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)
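
The ancestor walk at the top of the method generalizes to any operator type. Below is a minimal sketch of that pattern, assuming the single-parent chain of a semijoin branch; findAncestorOperator is a hypothetical helper, not part of the Hive codebase.

// Hypothetical helper: walk the first-parent chain from start and return the
// nearest operator of the requested type, or null if none is found before the
// chain ends (mirroring the loop above, the parentless root is not checked).
private static <T extends Operator<?>> T findAncestorOperator(Operator<?> start, Class<T> clazz) {
    for (Operator<?> curr = start; !curr.getParentOperators().isEmpty(); curr = curr.getParentOperators().get(0)) {
        if (clazz.isInstance(curr)) {
            return clazz.cast(curr);
        }
    }
    return null;
}

With such a helper, the lookup above reduces to findAncestorOperator(rs, SelectOperator.class) followed by the same null check.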

Example 12 with SelectOperator

use of org.apache.hadoop.hive.ql.exec.SelectOperator in project hive by apache.

the class ReduceSinkDeDuplicationUtils method aggressiveDedup.

protected static boolean aggressiveDedup(ReduceSinkOperator cRS, ReduceSinkOperator pRS, ReduceSinkDeduplicateProcCtx dedupCtx) throws SemanticException {
    assert cRS.getNumParent() == 1;
    ReduceSinkDesc cConf = cRS.getConf();
    ReduceSinkDesc pConf = pRS.getConf();
    List<ExprNodeDesc> cKeys = cConf.getKeyCols();
    List<ExprNodeDesc> pKeys = pConf.getKeyCols();
    // Check that in the path between cRS and pRS, there are only Select operators
    // i.e. the sequence must be pRS-SEL*-cRS
    Operator<? extends OperatorDesc> parent = cRS.getParentOperators().get(0);
    while (parent != pRS) {
        assert parent.getNumParent() == 1;
        if (!(parent instanceof SelectOperator)) {
            return false;
        }
        parent = parent.getParentOperators().get(0);
    }
    // If child keys are null or empty, we bail out
    if (cKeys == null || cKeys.isEmpty()) {
        return false;
    }
    // If parent keys are null or empty, we bail out
    if (pKeys == null || pKeys.isEmpty()) {
        return false;
    }
    // Backtrack key columns of cRS to pRS
    // If we cannot backtrack any of the columns, bail out
    List<ExprNodeDesc> cKeysInParentRS = ExprNodeDescUtils.backtrack(cKeys, cRS, pRS);
    for (int i = 0; i < cKeysInParentRS.size(); i++) {
        ExprNodeDesc pexpr = cKeysInParentRS.get(i);
        if (pexpr == null) {
            // We cannot backtrack the expression, we bail out
            return false;
        }
    }
    cRS.getConf().setKeyCols(cKeysInParentRS);
    // Backtrack partition columns of cRS to pRS
    // If we cannot backtrack any of the columns, bail out
    List<ExprNodeDesc> cPartitionInParentRS = ExprNodeDescUtils.backtrack(cConf.getPartitionCols(), cRS, pRS);
    for (int i = 0; i < cPartitionInParentRS.size(); i++) {
        ExprNodeDesc pexpr = cPartitionInParentRS.get(i);
        if (pexpr == null) {
            // We cannot backtrack the expression, we bail out
            return false;
        }
    }
    cRS.getConf().setPartitionCols(cPartitionInParentRS);
    // Backtrack value columns of cRS to pRS
    // If we cannot backtrack any of the columns, bail out
    List<ExprNodeDesc> cValueInParentRS = ExprNodeDescUtils.backtrack(cConf.getValueCols(), cRS, pRS);
    for (int i = 0; i < cValueInParentRS.size(); i++) {
        ExprNodeDesc pexpr = cValueInParentRS.get(i);
        if (pexpr == null) {
            // We cannot backtrack the expression, we bail out
            return false;
        }
    }
    cRS.getConf().setValueCols(cValueInParentRS);
    // Backtrack bucket columns of cRS to pRS (if any)
    // If we cannot backtrack any of the columns, bail out
    if (cConf.getBucketCols() != null) {
        List<ExprNodeDesc> cBucketInParentRS = ExprNodeDescUtils.backtrack(cConf.getBucketCols(), cRS, pRS);
        for (int i = 0; i < cBucketInParentRS.size(); i++) {
            ExprNodeDesc pexpr = cBucketInParentRS.get(i);
            if (pexpr == null) {
                // We cannot backtrack the expression, we bail out
                return false;
            }
        }
        cRS.getConf().setBucketCols(cBucketInParentRS);
    }
    // Update column expression map
    for (Entry<String, ExprNodeDesc> e : cRS.getColumnExprMap().entrySet()) {
        e.setValue(ExprNodeDescUtils.backtrack(e.getValue(), cRS, pRS));
    }
    // Replace pRS with cRS and remove operator sequence from pRS to cRS
    // Recall that the sequence must be pRS-SEL*-cRS
    parent = cRS.getParentOperators().get(0);
    while (parent != pRS) {
        dedupCtx.addRemovedOperator(parent);
        parent = parent.getParentOperators().get(0);
    }
    dedupCtx.addRemovedOperator(pRS);
    cRS.getParentOperators().clear();
    for (Operator<? extends OperatorDesc> op : pRS.getParentOperators()) {
        op.replaceChild(pRS, cRS);
        cRS.getParentOperators().add(op);
    }
    pRS.getParentOperators().clear();
    pRS.getChildOperators().clear();
    return true;
}
Also used : SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
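
The four backtrack-then-check blocks above share one shape; here is a minimal consolidating sketch, assuming the same ExprNodeDescUtils.backtrack signature the method already uses (backtrackAllOrNull itself is a hypothetical helper).

// Hypothetical helper: backtrack all expressions from cRS to pRS, returning
// null if any single expression cannot be traced back.
private static List<ExprNodeDesc> backtrackAllOrNull(List<ExprNodeDesc> exprs,
        ReduceSinkOperator cRS, ReduceSinkOperator pRS) throws SemanticException {
    List<ExprNodeDesc> backtracked = ExprNodeDescUtils.backtrack(exprs, cRS, pRS);
    for (ExprNodeDesc expr : backtracked) {
        if (expr == null) {
            // At least one column could not be traced back to pRS
            return null;
        }
    }
    return backtracked;
}

Each of the key, partition, value, and bucket column blocks then becomes one call plus a null check before the corresponding setter.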

Example 13 with SelectOperator

use of org.apache.hadoop.hive.ql.exec.SelectOperator in project hive by apache.

the class ColumnPrunerProcCtx method handleFilterUnionChildren.

/**
 * If the input filter operator has direct child(ren) that are union operators,
 * and the filter's pruned columns are not the same as the union's, create a
 * select operator between them. The select operator has the same number of
 * columns as the pruned child operator.
 *
 * @param curOp
 *          The filter operator whose children need to be handled.
 * @throws SemanticException
 */
public void handleFilterUnionChildren(Operator<? extends OperatorDesc> curOp) throws SemanticException {
    if (curOp.getChildOperators() == null || !(curOp instanceof FilterOperator)) {
        return;
    }
    List<FieldNode> parentPrunList = prunedColLists.get(curOp);
    if (parentPrunList == null || parentPrunList.size() == 0) {
        return;
    }
    List<FieldNode> prunList = null;
    for (Operator<? extends OperatorDesc> child : curOp.getChildOperators()) {
        if (child instanceof UnionOperator) {
            prunList = genColLists(child);
            if (prunList == null || prunList.size() == 0 || parentPrunList.size() == prunList.size()) {
                continue;
            }
            ArrayList<ExprNodeDesc> exprs = new ArrayList<ExprNodeDesc>();
            ArrayList<String> outputColNames = new ArrayList<String>();
            Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
            ArrayList<ColumnInfo> outputRS = new ArrayList<ColumnInfo>();
            for (ColumnInfo colInfo : child.getSchema().getSignature()) {
                if (lookupColumn(prunList, colInfo.getInternalName()) == null) {
                    continue;
                }
                ExprNodeDesc colDesc = new ExprNodeColumnDesc(colInfo.getType(), colInfo.getInternalName(), colInfo.getTabAlias(), colInfo.getIsVirtualCol());
                exprs.add(colDesc);
                outputColNames.add(colInfo.getInternalName());
                ColumnInfo newCol = new ColumnInfo(colInfo.getInternalName(), colInfo.getType(), colInfo.getTabAlias(), colInfo.getIsVirtualCol(), colInfo.isHiddenVirtualCol());
                newCol.setAlias(colInfo.getAlias());
                outputRS.add(newCol);
                colExprMap.put(colInfo.getInternalName(), colDesc);
            }
            SelectDesc select = new SelectDesc(exprs, outputColNames, false);
            curOp.removeChild(child);
            SelectOperator sel = (SelectOperator) OperatorFactory.getAndMakeChild(select, new RowSchema(outputRS), curOp);
            OperatorFactory.makeChild(sel, child);
            sel.setColumnExprMap(colExprMap);
        }
    }
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SelectDesc(org.apache.hadoop.hive.ql.plan.SelectDesc)
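
The rewiring at the end of the loop is the general pattern for splicing a new operator between two connected operators. A minimal sketch using the same OperatorFactory calls as above (spliceSelect is a hypothetical helper name):

// Hypothetical helper: insert a SELECT between parent and child, where child
// is currently a direct child of parent.
private static SelectOperator spliceSelect(Operator<? extends OperatorDesc> parent,
        Operator<? extends OperatorDesc> child, SelectDesc select,
        RowSchema schema, Map<String, ExprNodeDesc> colExprMap) {
    // Detach the existing parent -> child edge
    parent.removeChild(child);
    // Create the SELECT as a new child of parent
    SelectOperator sel = (SelectOperator) OperatorFactory.getAndMakeChild(select, schema, parent);
    // Re-attach the original child beneath the SELECT
    OperatorFactory.makeChild(sel, child);
    sel.setColumnExprMap(colExprMap);
    return sel;
}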

Example 14 with SelectOperator

use of org.apache.hadoop.hive.ql.exec.SelectOperator in project hive by apache.

the class ConvertJoinMapJoin method removeCycleCreatingSemiJoinOps.

// Remove any semijoin branch associated with hashjoin's parent's operator
// pipeline which can cause a cycle after hashjoin optimization.
private void removeCycleCreatingSemiJoinOps(MapJoinOperator mapjoinOp, Operator<?> parentSelectOpOfBigTable, ParseContext parseContext) throws SemanticException {
    Map<ReduceSinkOperator, TableScanOperator> semiJoinMap = new HashMap<ReduceSinkOperator, TableScanOperator>();
    for (Operator<?> op : parentSelectOpOfBigTable.getChildOperators()) {
        if (!(op instanceof SelectOperator)) {
            continue;
        }
        while (op.getChildOperators().size() > 0) {
            op = op.getChildOperators().get(0);
        }
        // If not ReduceSink Op, skip
        if (!(op instanceof ReduceSinkOperator)) {
            continue;
        }
        ReduceSinkOperator rs = (ReduceSinkOperator) op;
        SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(rs);
        if (sjInfo == null) {
            // skip, no semijoin branch
            continue;
        }
        TableScanOperator ts = sjInfo.getTsOp();
        // Found a semijoin branch.
        // There can be more than one semijoin branch coming from the parent
        // GBY Operator of the RS Operator.
        Operator<?> parentGB = op.getParentOperators().get(0);
        for (Operator<?> childRS : parentGB.getChildOperators()) {
            // Get the RS and TS for this branch
            rs = (ReduceSinkOperator) childRS;
            ts = parseContext.getRsToSemiJoinBranchInfo().get(rs).getTsOp();
            assert ts != null;
            for (Operator<?> parent : mapjoinOp.getParentOperators()) {
                if (!(parent instanceof ReduceSinkOperator)) {
                    continue;
                }
                Set<TableScanOperator> tsOps = OperatorUtils.findOperatorsUpstream(parent, TableScanOperator.class);
                boolean found = false;
                for (TableScanOperator parentTS : tsOps) {
                    // If the parent is same as the ts, then we have a cycle.
                    if (ts == parentTS) {
                        semiJoinMap.put(rs, ts);
                        found = true;
                        break;
                    }
                }
                if (found)
                    break;
            }
        }
    }
    if (semiJoinMap.size() > 0) {
        for (ReduceSinkOperator rs : semiJoinMap.keySet()) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Found semijoin optimization from the big table side of a map join, which will cause a task cycle. " + "Removing semijoin " + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(semiJoinMap.get(rs)));
            }
            GenTezUtils.removeBranch(rs);
            GenTezUtils.removeSemiJoinOperator(parseContext, rs, semiJoinMap.get(rs));
        }
    }
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) HashMap(java.util.HashMap) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)
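
The nested loops implement a reachability test: does the semijoin's target table scan also sit upstream of the map join itself? A minimal sketch of that test in isolation, using the same OperatorUtils.findOperatorsUpstream call (createsTaskCycle is a hypothetical helper name):

// Hypothetical helper: true if the semijoin's target TS also feeds the map
// join through one of its ReduceSink parents, in which case keeping the
// semijoin edge would close a cycle between the two tasks.
private static boolean createsTaskCycle(MapJoinOperator mapjoinOp, TableScanOperator semijoinTargetTs) {
    for (Operator<?> parent : mapjoinOp.getParentOperators()) {
        if (!(parent instanceof ReduceSinkOperator)) {
            continue;
        }
        Set<TableScanOperator> upstream =
                OperatorUtils.findOperatorsUpstream(parent, TableScanOperator.class);
        if (upstream.contains(semijoinTargetTs)) {
            return true;
        }
    }
    return false;
}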

Example 15 with SelectOperator

use of org.apache.hadoop.hive.ql.exec.SelectOperator in project hive by apache.

the class DynamicPartitionPruningOptimization method getColumnInfo.

// Given a key, find the corresponding column name.
private boolean getColumnInfo(DynamicListContext ctx, StringBuilder internalColName, StringBuilder colName, StringBuilder tabAlias) {
    ExprNodeDesc exprNodeDesc = ctx.generator.getConf().getKeyCols().get(ctx.desc.getKeyIndex());
    ExprNodeColumnDesc colExpr = ExprNodeDescUtils.getColumnExpr(exprNodeDesc);
    if (colExpr == null) {
        return false;
    }
    internalColName.append(colExpr.getColumn());
    // fetch table alias
    ExprNodeDescUtils.ColumnOrigin columnOrigin = ExprNodeDescUtils.findColumnOrigin(exprNodeDesc, ctx.generator);
    if (columnOrigin != null) {
        // get both tableAlias and column name from columnOrigin
        assert columnOrigin.op instanceof TableScanOperator;
        TableScanOperator ts = (TableScanOperator) columnOrigin.op;
        tabAlias.append(ts.getConf().getAlias());
        colName.append(ExprNodeDescUtils.getColumnExpr(columnOrigin.col).getColumn());
        return true;
    }
    Operator<? extends OperatorDesc> parentOfRS = ctx.generator.getParentOperators().get(0);
    if (!(parentOfRS instanceof SelectOperator)) {
        colName.append(internalColName.toString());
        return true;
    }
    exprNodeDesc = parentOfRS.getColumnExprMap().get(internalColName.toString());
    colExpr = ExprNodeDescUtils.getColumnExpr(exprNodeDesc);
    if (colExpr == null) {
        return false;
    }
    colName.append(ExprNodeDescUtils.extractColName(colExpr));
    return true;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDescUtils(org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)
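
The three StringBuilder parameters act as out-parameters that the caller reads on success. A minimal sketch of a call site, with ctx and everything around it left hypothetical:

// Hypothetical call site: resolve the column behind a semijoin key expression.
StringBuilder internalColName = new StringBuilder();
StringBuilder colName = new StringBuilder();
StringBuilder tabAlias = new StringBuilder();
if (!getColumnInfo(ctx, internalColName, colName, tabAlias)) {
    return;  // the key expression does not reduce to a single column
}
// tabAlias stays empty when the column's origin cannot be traced to a table
// scan; in that case colName falls back to the internal column name. The
// qualified name would then feed, e.g., the pruning expression being built.
String qualifiedName = tabAlias.length() > 0
        ? tabAlias + "." + colName
        : colName.toString();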

Aggregations

SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 31 uses
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 20 uses
ArrayList (java.util.ArrayList): 14 uses
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc): 13 uses
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 12 uses
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 11 uses
SelectDesc (org.apache.hadoop.hive.ql.plan.SelectDesc): 10 uses
HashMap (java.util.HashMap): 8 uses
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 8 uses
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 8 uses
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 8 uses
LinkedHashMap (java.util.LinkedHashMap): 7 uses
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 7 uses
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 7 uses
Operator (org.apache.hadoop.hive.ql.exec.Operator): 7 uses
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 7 uses
Test (org.junit.Test): 7 uses
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 6 uses
ExprNodeFieldDesc (org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc): 6 uses
AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator): 4 uses