
Example 6 with JoinOperator

use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.

The class GenSparkUtils, method getEdgeProperty:

public static SparkEdgeProperty getEdgeProperty(ReduceSinkOperator reduceSink, ReduceWork reduceWork) throws SemanticException {
    SparkEdgeProperty edgeProperty = new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE);
    edgeProperty.setNumPartitions(reduceWork.getNumReduceTasks());
    String sortOrder = Strings.nullToEmpty(reduceSink.getConf().getOrder()).trim();
    if (hasGBYOperator(reduceSink)) {
        edgeProperty.setShuffleGroup();
        // SHUFFLE_SORT shouldn't be used for this purpose, see HIVE-8542
        if (!sortOrder.isEmpty() && groupByNeedParLevelOrder(reduceSink)) {
            edgeProperty.setMRShuffle();
        }
    }
    if (reduceWork.getReducer() instanceof JoinOperator) {
        // reduce-side join: use MR-style shuffle
        edgeProperty.setMRShuffle();
    }
    // If it's a FileSink writing bucketed files, also use MR-style shuffle
    // to get a taskId compatible with the bucket file names
    FileSinkOperator fso = getChildOperator(reduceWork.getReducer(), FileSinkOperator.class);
    if (fso != null) {
        String bucketCount = fso.getConf().getTableInfo().getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT);
        if (bucketCount != null && Integer.parseInt(bucketCount) > 1) {
            edgeProperty.setMRShuffle();
        }
    }
    // test if we need partition/global order, SHUFFLE_SORT should only be used for global order
    if (edgeProperty.isShuffleNone() && !sortOrder.isEmpty()) {
        if ((reduceSink.getConf().getPartitionCols() == null || reduceSink.getConf().getPartitionCols().isEmpty() || isSame(reduceSink.getConf().getPartitionCols(), reduceSink.getConf().getKeyCols())) && reduceSink.getConf().hasOrderBy()) {
            edgeProperty.setShuffleSort();
        } else {
            edgeProperty.setMRShuffle();
        }
    }
    // simple distribute-by goes here
    if (edgeProperty.isShuffleNone()) {
        edgeProperty.setShuffleGroup();
    }
    return edgeProperty;
}
Also used: SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator), JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator), FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator), SparkEdgeProperty (org.apache.hadoop.hive.ql.plan.SparkEdgeProperty)
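
For reference, here is a minimal, self-contained sketch of the decision order above. The ShuffleType enum and the boolean flags are illustrative stand-ins for SparkEdgeProperty and the operator checks (hasGBYOperator, getChildOperator, and so on); they are not Hive APIs.

enum ShuffleType { NONE, GROUP, SORT, MR }

final class EdgeDecisionSketch {

    // Mirrors getEdgeProperty: group-by handling first, then the join and
    // bucketed-file-sink overrides, then sort handling, then plain distribute-by.
    static ShuffleType choose(boolean hasGroupBy, boolean needsParallelOrder,
                              boolean reduceSideJoin, boolean bucketedFileSink,
                              boolean hasSortOrder, boolean globalOrder) {
        ShuffleType type = ShuffleType.NONE;
        if (hasGroupBy) {
            type = ShuffleType.GROUP;
            // per HIVE-8542, SHUFFLE_SORT must not serve parallel group-by ordering
            if (hasSortOrder && needsParallelOrder) {
                type = ShuffleType.MR;
            }
        }
        if (reduceSideJoin || bucketedFileSink) {
            type = ShuffleType.MR;       // MR-style shuffle for joins and bucketing
        }
        if (type == ShuffleType.NONE && hasSortOrder) {
            // SHUFFLE_SORT is reserved for global (order-by) ordering
            type = globalOrder ? ShuffleType.SORT : ShuffleType.MR;
        }
        if (type == ShuffleType.NONE) {
            type = ShuffleType.GROUP;    // simple distribute-by
        }
        return type;
    }
}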

Example 7 with JoinOperator

use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.

The class ConvertJoinMapJoin, method process:

@Override
/*
 * (non-Javadoc) Ideally we should not modify the tree we traverse; however,
 * since we need to walk the tree whenever we modify an operator, we might as
 * well do it here.
 */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
    OptimizeTezProcContext context = (OptimizeTezProcContext) procCtx;
    JoinOperator joinOp = (JoinOperator) nd;
    long maxSize = context.conf.getLongVar(HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
    TezBucketJoinProcCtx tezBucketJoinProcCtx = new TezBucketJoinProcCtx(context.conf);
    if (!context.conf.getBoolVar(HiveConf.ConfVars.HIVECONVERTJOIN)) {
        // we are just converting to a common merge join operator, i.e. the
        // equivalent of a shuffle join in the map-reduce case.
        Object retval = checkAndConvertSMBJoin(context, joinOp, tezBucketJoinProcCtx);
        if (retval == null) {
            return retval;
        } else {
            fallbackToReduceSideJoin(joinOp, context);
            return null;
        }
    }
    // if we have traits, and table info is present in the traits, we know the
    // exact number of buckets. Otherwise, choose the largest number of
    // estimated reducers from the parent operators.
    int numBuckets = -1;
    if (context.conf.getBoolVar(HiveConf.ConfVars.HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ)) {
        numBuckets = estimateNumBuckets(joinOp, true);
    } else {
        numBuckets = 1;
    }
    LOG.info("Estimated number of buckets " + numBuckets);
    int mapJoinConversionPos = getMapJoinConversionPos(joinOp, context, numBuckets, false, maxSize, true);
    if (mapJoinConversionPos < 0) {
        Object retval = checkAndConvertSMBJoin(context, joinOp, tezBucketJoinProcCtx);
        if (retval == null) {
            return retval;
        } else {
            // the only remaining case is a full outer join with SMB enabled,
            // which is not possible; convert to a regular join.
            fallbackToReduceSideJoin(joinOp, context);
            return null;
        }
    }
    if (numBuckets > 1) {
        if (context.conf.getBoolVar(HiveConf.ConfVars.HIVE_CONVERT_JOIN_BUCKET_MAPJOIN_TEZ)) {
            if (convertJoinBucketMapJoin(joinOp, context, mapJoinConversionPos, tezBucketJoinProcCtx)) {
                return null;
            }
        }
    }
    // check if we can convert to a map join with no bucket scaling.
    LOG.info("Convert to non-bucketed map join");
    if (numBuckets != 1) {
        mapJoinConversionPos = getMapJoinConversionPos(joinOp, context, 1, false, maxSize, true);
    }
    if (mapJoinConversionPos < 0) {
        // we are just converting to a common merge join operator, i.e. the
        // equivalent of a shuffle join in the map-reduce case.
        fallbackToReduceSideJoin(joinOp, context);
        return null;
    }
    MapJoinOperator mapJoinOp = convertJoinMapJoin(joinOp, context, mapJoinConversionPos, true);
    // the map join operator by default has no bucket columns, and its number
    // of reduce sinks is reduced by one
    mapJoinOp.setOpTraits(new OpTraits(null, -1, null, joinOp.getOpTraits().getNumReduceSinks()));
    mapJoinOp.setStatistics(joinOp.getStatistics());
    // propagate this change till the next RS
    for (Operator<? extends OperatorDesc> childOp : mapJoinOp.getChildOperators()) {
        setAllChildrenTraits(childOp, mapJoinOp.getOpTraits());
    }
    return null;
}
Also used: CommonMergeJoinOperator (org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator), CommonJoinOperator (org.apache.hadoop.hive.ql.exec.CommonJoinOperator), MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator), JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator), OpTraits (org.apache.hadoop.hive.ql.plan.OpTraits), OptimizeTezProcContext (org.apache.hadoop.hive.ql.parse.OptimizeTezProcContext)
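
The control flow above is easier to see with the Hive plumbing stripped away. The following is a simplified, hypothetical sketch of the conversion ladder, assuming the cost inputs are already known; the types and names are illustrative rather than Hive APIs, and the SMB branch is collapsed into the merge-join fallback.

final class JoinConversionSketch {

    enum Result { BUCKET_MAP_JOIN, MAP_JOIN, MERGE_JOIN }

    // Mirrors process(): fall back to a merge (shuffle) join when map joins
    // are disabled or the small tables exceed the threshold; otherwise prefer
    // a bucket map join when bucketing lines up, else a plain map join.
    static Result convert(boolean mapJoinEnabled, boolean bucketMapJoinEnabled,
                          int numBuckets, long smallTableBytes, long thresholdBytes) {
        if (!mapJoinEnabled || smallTableBytes > thresholdBytes) {
            return Result.MERGE_JOIN;      // cf. fallbackToReduceSideJoin
        }
        if (bucketMapJoinEnabled && numBuckets > 1) {
            return Result.BUCKET_MAP_JOIN; // cf. convertJoinBucketMapJoin
        }
        return Result.MAP_JOIN;            // cf. convertJoinMapJoin
    }
}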

Example 8 with JoinOperator

use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.

The class CommonJoinTaskDispatcher, method convertTaskToMapJoinTask:

// create a map join task, treating the table at bigTablePosition as the big table
private MapRedTask convertTaskToMapJoinTask(MapredWork newWork, int bigTablePosition) throws UnsupportedEncodingException, SemanticException {
    // create a mapred task for this work
    MapRedTask newTask = (MapRedTask) TaskFactory.get(newWork, physicalContext.getParseContext().getConf());
    JoinOperator newJoinOp = getJoinOp(newTask);
    // optimize this newWork given the big table position
    MapJoinProcessor.genMapJoinOpAndLocalWork(physicalContext.getParseContext().getConf(), newWork, newJoinOp, bigTablePosition);
    return newTask;
}
Also used: MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask), JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)
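
A hedged usage sketch: a dispatcher can call a converter like convertTaskToMapJoinTask once per candidate big-table position and keep the first task whose remaining (small) tables fit in memory. Everything below, including the IntFunction converter parameter, is an illustrative simplification, not the dispatcher's actual API.

import java.util.List;
import java.util.function.IntFunction;

final class BigTableChooserSketch {

    // Returns the converted task for the first big-table position whose small
    // tables fit under memoryLimitBytes, or null to keep the reduce-side join.
    static <T> T pickMapJoinTask(List<Long> tableSizes, long memoryLimitBytes,
                                 IntFunction<T> convertAtPosition) {
        for (int pos = 0; pos < tableSizes.size(); pos++) {
            long smallTableBytes = 0;
            for (int i = 0; i < tableSizes.size(); i++) {
                if (i != pos) {
                    smallTableBytes += tableSizes.get(i);
                }
            }
            if (smallTableBytes <= memoryLimitBytes) {
                return convertAtPosition.apply(pos); // cf. convertTaskToMapJoinTask
            }
        }
        return null;
    }
}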

Example 9 with JoinOperator

use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.

The class CommonJoinTaskDispatcher, method getJoinOp:

private JoinOperator getJoinOp(MapRedTask task) throws SemanticException {
    MapWork mWork = task.getWork().getMapWork();
    ReduceWork rWork = task.getWork().getReduceWork();
    if (rWork == null) {
        return null;
    }
    Operator<? extends OperatorDesc> reducerOp = rWork.getReducer();
    if (reducerOp instanceof JoinOperator) {
        /* check whether any operator in the map work prevents the conversion */
        Map<String, Operator<? extends OperatorDesc>> aliasToWork = mWork.getAliasToWork();
        for (Operator<? extends OperatorDesc> op : aliasToWork.values()) {
            if (!checkOperatorOKMapJoinConversion(op)) {
                return null;
            }
        }
        return (JoinOperator) reducerOp;
    } else {
        return null;
    }
}
Also used: JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator), LateralViewForwardOperator (org.apache.hadoop.hive.ql.exec.LateralViewForwardOperator), FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), Operator (org.apache.hadoop.hive.ql.exec.Operator), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)
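
The guard inside getJoinOp generalizes to a plain recursive tree check. Below is a self-contained sketch; Op is a hypothetical stand-in for Operator<? extends OperatorDesc>, and the predicate plays the role of checkOperatorOKMapJoinConversion.

import java.util.List;
import java.util.function.Predicate;

final class ConversionCheckSketch {

    // Minimal stand-in for an operator tree node.
    static final class Op {
        final List<Op> children;
        Op(List<Op> children) { this.children = children; }
    }

    // Rejects the conversion as soon as any node in the tree fails the check.
    static boolean treeOk(Op root, Predicate<Op> okForMapJoin) {
        if (!okForMapJoin.test(root)) {
            return false;
        }
        for (Op child : root.children) {
            if (!treeOk(child, okForMapJoin)) {
                return false;
            }
        }
        return true;
    }
}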

Example 10 with JoinOperator

use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.

The class CrossProductCheck, method checkMRReducer:

private void checkMRReducer(String taskName, MapredWork mrWrk) throws SemanticException {
    ReduceWork rWrk = mrWrk.getReduceWork();
    if (rWrk == null) {
        return;
    }
    Operator<? extends OperatorDesc> reducer = rWrk.getReducer();
    if (reducer instanceof JoinOperator || reducer instanceof CommonMergeJoinOperator) {
        BaseWork prntWork = mrWrk.getMapWork();
        checkForCrossProduct(taskName, reducer, new ExtractReduceSinkInfo(null).analyze(prntWork));
    }
}
Also used: CommonMergeJoinOperator (org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator), MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator), AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator), JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator), ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork), BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)
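
The underlying signal is simple: a reduce-side join fed by a ReduceSink with no key columns shuffles every row to the same key, which is a cross product. The following hypothetical sketch shows that check in isolation; the ReduceSinkInfo class is an invented stand-in for what ExtractReduceSinkInfo gathers.

final class CrossProductWarningSketch {

    // Invented stand-in for the per-sink info that ExtractReduceSinkInfo collects.
    static final class ReduceSinkInfo {
        final String taskName;
        final int numKeyColumns;
        ReduceSinkInfo(String taskName, int numKeyColumns) {
            this.taskName = taskName;
            this.numKeyColumns = numKeyColumns;
        }
    }

    // A join reducer with an empty shuffle key means every row lands on one
    // key: warn about the cross product instead of failing the query.
    static void warnIfCrossProduct(boolean reducerIsJoin, ReduceSinkInfo info) {
        if (reducerIsJoin && info.numKeyColumns == 0) {
            System.err.println("Warning: " + info.taskName
                + " is a cross product; all rows shuffle to a single reducer key.");
        }
    }
}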

Aggregations

JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 32
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 18
Operator (org.apache.hadoop.hive.ql.exec.Operator): 18
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 15
ArrayList (java.util.ArrayList): 14
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 14
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 12
HashMap (java.util.HashMap): 11
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 11
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 11
List (java.util.List): 9
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 9
JoinDesc (org.apache.hadoop.hive.ql.plan.JoinDesc): 9
AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator): 8
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 8
HashSet (java.util.HashSet): 7
LinkedHashMap (java.util.LinkedHashMap): 7
Path (org.apache.hadoop.fs.Path): 7
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 7
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 7