
Example 36 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class SharedWorkOptimizer, method compareOperator.

private static boolean compareOperator(ParseContext pctx, Operator<?> op1, Operator<?> op2) throws SemanticException {
    if (!op1.getClass().getName().equals(op2.getClass().getName())) {
        return false;
    }
    // TODO: move this to logicalEquals
    if (op1 instanceof ReduceSinkOperator) {
        ReduceSinkDesc op1Conf = ((ReduceSinkOperator) op1).getConf();
        ReduceSinkDesc op2Conf = ((ReduceSinkOperator) op2).getConf();
        // Note: getParitionColsString is the accessor's actual (misspelled) name in ReduceSinkDesc.
        if (StringUtils.equals(op1Conf.getKeyColString(), op2Conf.getKeyColString())
                && StringUtils.equals(op1Conf.getValueColsString(), op2Conf.getValueColsString())
                && StringUtils.equals(op1Conf.getParitionColsString(), op2Conf.getParitionColsString())
                && op1Conf.getTag() == op2Conf.getTag()
                && StringUtils.equals(op1Conf.getOrder(), op2Conf.getOrder())
                && op1Conf.getTopN() == op2Conf.getTopN()
                && op1Conf.isAutoParallel() == op2Conf.isAutoParallel()) {
            return true;
        } else {
            return false;
        }
    }
    // TODO: move this to logicalEquals
    if (op1 instanceof TableScanOperator) {
        TableScanOperator tsOp1 = (TableScanOperator) op1;
        TableScanOperator tsOp2 = (TableScanOperator) op2;
        TableScanDesc op1Conf = tsOp1.getConf();
        TableScanDesc op2Conf = tsOp2.getConf();
        Table tableMeta1 = op1Conf.getTableMetadata();
        Table tableMeta2 = op2Conf.getTableMetadata();
        if (StringUtils.equals(tableMeta1.getFullyQualifiedName(), tableMeta2.getFullyQualifiedName())
                && op1Conf.getNeededColumns().equals(op2Conf.getNeededColumns())
                && StringUtils.equals(op1Conf.getFilterExprString(), op2Conf.getFilterExprString())
                && pctx.getPrunedPartitions(tsOp1).getPartitions().equals(pctx.getPrunedPartitions(tsOp2).getPartitions())
                && op1Conf.getRowLimit() == op2Conf.getRowLimit()) {
            return true;
        } else {
            return false;
        }
    }
    return op1.logicalEquals(op2);
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
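
Why StringUtils.equals rather than String.equals above: the *ColString() accessors on ReduceSinkDesc can return null, and StringUtils.equals is null-safe. A minimal, self-contained sketch of that behavior (illustrative class, not part of Hive; assumes Apache Commons Lang 3 on the classpath):

import org.apache.commons.lang3.StringUtils;

public class NullSafeEqualsSketch {
    public static void main(String[] args) {
        // Two nulls compare equal instead of throwing a NullPointerException.
        System.out.println(StringUtils.equals(null, null));     // true
        // A null on one side is simply unequal.
        System.out.println(StringUtils.equals("key0", null));   // false
        // Non-null arguments behave like String.equals.
        System.out.println(StringUtils.equals("key0", "key0")); // true
    }
}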

Example 37 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class SharedWorkOptimizer, method gatherDPPTableScanOps.

/**
 * This method gathers the TS operators with DPP from the context and
 * stores them into the input optimization cache.
 */
private static void gatherDPPTableScanOps(ParseContext pctx, SharedWorkOptimizerCache optimizerCache) throws SemanticException {
    // Find TS operators with partition pruning enabled in the plan,
    // because these TS operators may read different data for
    // different pipelines.
    // These can be:
    // 1) TS with DPP.
    // 2) TS with semijoin DPP.
    Map<String, TableScanOperator> topOps = pctx.getTopOps();
    Collection<Operator<? extends OperatorDesc>> tableScanOps = Lists.<Operator<?>>newArrayList(topOps.values());
    Set<AppMasterEventOperator> s = OperatorUtils.findOperators(tableScanOps, AppMasterEventOperator.class);
    for (AppMasterEventOperator a : s) {
        if (a.getConf() instanceof DynamicPruningEventDesc) {
            DynamicPruningEventDesc dped = (DynamicPruningEventDesc) a.getConf();
            optimizerCache.tableScanToDPPSource.put(dped.getTableScan(), a);
        }
    }
    for (Entry<ReduceSinkOperator, SemiJoinBranchInfo> e : pctx.getRsToSemiJoinBranchInfo().entrySet()) {
        optimizerCache.tableScanToDPPSource.put(e.getValue().getTsOp(), e.getKey());
    }
    LOG.debug("DPP information stored in the cache: {}", optimizerCache.tableScanToDPPSource);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc)
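
The cache populated above keys each pruned TableScanOperator by whatever operator produces its pruning values: an AppMasterEventOperator for event-based DPP, or the semijoin branch's ReduceSinkOperator. One scan can be pruned by several sources, so a multimap is the natural shape. A minimal sketch with hypothetical stand-in types (illustrative only; it assumes the Guava Multimap that Hive already ships, not the real SharedWorkOptimizerCache):

import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;

public class DPPSourceCacheSketch {
    // Hypothetical stand-ins for the operators involved.
    static class TableScan { }
    static class DPPSource {
        final String kind;
        DPPSource(String kind) { this.kind = kind; }
    }

    public static void main(String[] args) {
        // One TableScan can map to several pruning sources, hence a multimap.
        Multimap<TableScan, DPPSource> tableScanToDPPSource = HashMultimap.create();
        TableScan ts = new TableScan();
        tableScanToDPPSource.put(ts, new DPPSource("AppMasterEventOperator")); // event-based DPP
        tableScanToDPPSource.put(ts, new DPPSource("ReduceSinkOperator"));     // semijoin DPP
        System.out.println(tableScanToDPPSource.get(ts).size()); // 2
    }
}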

Example 38 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class ConvertJoinMapJoin, method removeCycleCreatingSemiJoinOps.

// Remove any semijoin branch associated with the hash join's parent operator
// pipeline that could create a cycle after the hash join optimization.
private void removeCycleCreatingSemiJoinOps(MapJoinOperator mapjoinOp, Operator<?> parentSelectOpOfBigTable, ParseContext parseContext) throws SemanticException {
    Map<ReduceSinkOperator, TableScanOperator> semiJoinMap = new HashMap<ReduceSinkOperator, TableScanOperator>();
    for (Operator<?> op : parentSelectOpOfBigTable.getChildOperators()) {
        if (!(op instanceof SelectOperator)) {
            continue;
        }
        while (op.getChildOperators().size() > 0) {
            op = op.getChildOperators().get(0);
        }
        // If not ReduceSink Op, skip
        if (!(op instanceof ReduceSinkOperator)) {
            continue;
        }
        ReduceSinkOperator rs = (ReduceSinkOperator) op;
        // Guard the map lookup: an RS without semijoin info would otherwise
        // cause a NullPointerException on getTsOp().
        SemiJoinBranchInfo sjBranchInfo = parseContext.getRsToSemiJoinBranchInfo().get(rs);
        if (sjBranchInfo == null) {
            // skip, no semijoin branch
            continue;
        }
        TableScanOperator ts = sjBranchInfo.getTsOp();
        // Found a semijoin branch.
        // There can be more than one semijoin branch coming from the parent
        // GBY Operator of the RS Operator.
        Operator<?> parentGB = op.getParentOperators().get(0);
        for (Operator<?> childRS : parentGB.getChildOperators()) {
            // Get the RS and TS for this branch
            rs = (ReduceSinkOperator) childRS;
            ts = parseContext.getRsToSemiJoinBranchInfo().get(rs).getTsOp();
            assert ts != null;
            for (Operator<?> parent : mapjoinOp.getParentOperators()) {
                if (!(parent instanceof ReduceSinkOperator)) {
                    continue;
                }
                Set<TableScanOperator> tsOps = OperatorUtils.findOperatorsUpstream(parent, TableScanOperator.class);
                boolean found = false;
                for (TableScanOperator parentTS : tsOps) {
                    // If the parent is same as the ts, then we have a cycle.
                    if (ts == parentTS) {
                        semiJoinMap.put(rs, ts);
                        found = true;
                        break;
                    }
                }
                if (found) {
                    break;
                }
            }
        }
    }
    if (semiJoinMap.size() > 0) {
        for (ReduceSinkOperator rs : semiJoinMap.keySet()) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Found semijoin optimization from the big table side of a map join, which will cause a task cycle. " + "Removing semijoin " + OperatorUtils.getOpNamePretty(rs) + " - " + OperatorUtils.getOpNamePretty(semiJoinMap.get(rs)));
            }
            GenTezUtils.removeBranch(rs);
            GenTezUtils.removeSemiJoinOperator(parseContext, rs, semiJoinMap.get(rs));
        }
    }
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) HashMap(java.util.HashMap) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo)
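
The cycle this method removes: a semijoin branch hanging off the big-table side targets a TableScanOperator that is itself upstream of one of the map join's parents, so after the conversion the task would wait on its own output. A minimal sketch of the upstream-reachability test, mirroring what OperatorUtils.findOperatorsUpstream plus the identity comparison accomplish above (hypothetical Node type, not Hive's operator classes):

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class SemiJoinCycleSketch {
    static class Node {
        final String name;
        final List<Node> parents = new ArrayList<>();
        Node(String name) { this.name = name; }
    }

    // Walk upstream from 'start' and report whether 'target' is reachable.
    static boolean reachesUpstream(Node start, Node target) {
        Deque<Node> work = new ArrayDeque<>();
        work.push(start);
        Set<Node> seen = new HashSet<>();
        while (!work.isEmpty()) {
            Node n = work.pop();
            if (!seen.add(n)) {
                continue; // already visited
            }
            if (n == target) {
                return true;
            }
            n.parents.forEach(work::push);
        }
        return false;
    }

    public static void main(String[] args) {
        Node ts = new Node("TS[big table]");
        Node rs = new Node("RS[parent of MAPJOIN]");
        rs.parents.add(ts);
        // The semijoin branch targets ts, and ts is upstream of the map join:
        // exactly the situation in which the branch is removed above.
        System.out.println(reachesUpstream(rs, ts)); // true -> remove the branch
    }
}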

Example 39 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class ConvertJoinMapJoin, method checkConvertJoinSMBJoin.

/*
 * This method tries to convert a join to an SMB join. The decision is based on
 * traits: if the sort columns are the same as the join columns, then the join
 * can be converted to an SMB join. Otherwise the bucket map join is retained,
 * as it is still more efficient than a regular join.
 */
private boolean checkConvertJoinSMBJoin(JoinOperator joinOp, OptimizeTezProcContext context, int bigTablePosition, TezBucketJoinProcCtx tezBucketJoinProcCtx) throws SemanticException {
    ReduceSinkOperator bigTableRS = (ReduceSinkOperator) joinOp.getParentOperators().get(bigTablePosition);
    int numBuckets = bigTableRS.getParentOperators().get(0).getOpTraits().getNumBuckets();
    int size = -1;
    for (Operator<?> parentOp : joinOp.getParentOperators()) {
        // The sides must be balanced: either every join input has at least one
        // upstream RS or none does; if they are unbalanced, we cannot convert.
        // This is a workaround for now. The right fix would be to refactor the code in
        // MapRecordProcessor and ReduceRecordProcessor with respect to the sources.
        Set<ReduceSinkOperator> set = OperatorUtils.findOperatorsUpstream(parentOp.getParentOperators(), ReduceSinkOperator.class);
        if (size < 0) {
            size = set.size();
            continue;
        }
        if (((size > 0) && (set.size() > 0)) || ((size == 0) && (set.size() == 0))) {
            continue;
        } else {
            return false;
        }
    }
    // transformation of the join operation
    for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
        if (!(parentOp instanceof ReduceSinkOperator)) {
            // could be mux/demux operators. Currently not supported
            LOG.info("Found correlation optimizer operators. Cannot convert to SMB at this time.");
            return false;
        }
        ReduceSinkOperator rsOp = (ReduceSinkOperator) parentOp;
        if (!checkColEquality(rsOp.getParentOperators().get(0).getOpTraits().getSortCols(), rsOp.getOpTraits().getSortCols(), rsOp.getColumnExprMap(), false)) {
            LOG.info("We cannot convert to SMB because the sort column names do not match.");
            return false;
        }
        if (!checkColEquality(rsOp.getParentOperators().get(0).getOpTraits().getBucketColNames(), rsOp.getOpTraits().getBucketColNames(), rsOp.getColumnExprMap(), true)) {
            LOG.info("We cannot convert to SMB because bucket column names do not match.");
            return false;
        }
    }
    if (numBuckets < 0) {
        numBuckets = bigTableRS.getConf().getNumReducers();
    }
    tezBucketJoinProcCtx.setNumBuckets(numBuckets);
    LOG.info("We can convert the join to an SMB join.");
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)
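
The first loop encodes a balance rule: either every join input has at least one upstream ReduceSinkOperator, or none of them does; a mix cannot be handled by the current record processors. A standalone restatement of that rule over the per-side RS counts (hypothetical helper, not a Hive API):

import java.util.List;

public class BalancedSidesSketch {
    // True when all sides agree on having (or not having) upstream RS operators.
    static boolean sidesBalanced(List<Integer> upstreamRsCounts) {
        int first = -1;
        for (int count : upstreamRsCounts) {
            if (first < 0) {
                first = count; // remember the first side's count (counts are >= 0)
                continue;
            }
            boolean bothHaveRs = first > 0 && count > 0;
            boolean neitherHasRs = first == 0 && count == 0;
            if (!bothHaveRs && !neitherHasRs) {
                return false; // unbalanced sides: cannot convert to SMB
            }
        }
        return true;
    }

    public static void main(String[] args) {
        System.out.println(sidesBalanced(List.of(2, 1))); // true: both sides have RS
        System.out.println(sidesBalanced(List.of(2, 0))); // false: unbalanced
    }
}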

Example 40 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class ConvertJoinMapJoin, method checkConvertJoinBucketMapJoin.

/*
 * If the reduce sink on the big table side emits the same key columns as its
 * parent, we can create a bucket map join, eliminating the reduce sink.
 */
private boolean checkConvertJoinBucketMapJoin(JoinOperator joinOp, int bigTablePosition, TezBucketJoinProcCtx tezBucketJoinProcCtx) throws SemanticException {
    // the join's parents are expected to be reduce sinks; bail out otherwise
    if (!(joinOp.getParentOperators().get(0) instanceof ReduceSinkOperator)) {
        LOG.info("Operator is " + joinOp.getParentOperators().get(0).getName() + ". Cannot convert to bucket map join");
        return false;
    }
    ReduceSinkOperator rs = (ReduceSinkOperator) joinOp.getParentOperators().get(bigTablePosition);
    List<List<String>> parentColNames = rs.getOpTraits().getBucketColNames();
    Operator<? extends OperatorDesc> parentOfParent = rs.getParentOperators().get(0);
    List<List<String>> grandParentColNames = parentOfParent.getOpTraits().getBucketColNames();
    int numBuckets = parentOfParent.getOpTraits().getNumBuckets();
    // verify that the bucket columns of the RS match those of its parent
    if (!checkColEquality(grandParentColNames, parentColNames, rs.getColumnExprMap(), true)) {
        LOG.info("No info available to check for bucket map join. Cannot convert");
        return false;
    }
    /*
     * this is the case when the big table is a sub-query and is probably already bucketed by the
     * join column in say a group by operation
     */
    if (numBuckets < 0) {
        numBuckets = rs.getConf().getNumReducers();
    }
    tezBucketJoinProcCtx.setNumBuckets(numBuckets);
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) List(java.util.List) ArrayList(java.util.ArrayList)
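
The numBuckets fallback at the end mirrors the one in checkConvertJoinSMBJoin: op traits report a negative bucket count when it is unknown, e.g. for a sub-query input that was only implicitly bucketed by a group by, and the method then uses the reduce sink's reducer count instead. A one-line restatement (hypothetical helper, values illustrative):

public class BucketCountFallbackSketch {
    // Traits report -1 when the bucket count is unknown; fall back to reducers.
    static int effectiveNumBuckets(int traitsNumBuckets, int rsNumReducers) {
        return traitsNumBuckets < 0 ? rsNumReducers : traitsNumBuckets;
    }

    public static void main(String[] args) {
        System.out.println(effectiveNumBuckets(-1, 8)); // 8: unknown, use reducer count
        System.out.println(effectiveNumBuckets(16, 8)); // 16: known bucket count wins
    }
}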

Aggregations

ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)86 Operator (org.apache.hadoop.hive.ql.exec.Operator)50 ArrayList (java.util.ArrayList)48 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)45 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)35 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)31 HashMap (java.util.HashMap)29 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)28 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)27 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)26 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)26 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)25 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)24 RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema)23 List (java.util.List)19 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)19 LinkedHashMap (java.util.LinkedHashMap)18 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)18 ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)18 AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator)15