use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.
the class GenMapRedUtils method splitTasks.
/**
 * Split two tasks by creating a temporary file between them.
 *
 * @param op reduce sink operator being processed
 * @param parentTask the parent task
 * @param childTask the child task
 * @param opProcCtx context
 */
@SuppressWarnings("nls")
private static void splitTasks(ReduceSinkOperator op, Task<? extends Serializable> parentTask, Task<? extends Serializable> childTask, GenMRProcContext opProcCtx) throws SemanticException {
  if (op.getNumParent() != 1) {
    throw new IllegalStateException("Expecting operator " + op + " to have one parent. " + "But found multiple parents : " + op.getParentOperators());
  }
  ParseContext parseCtx = opProcCtx.getParseCtx();
  parentTask.addDependentTask(childTask);
  // Root Task cannot depend on any other task, therefore childTask cannot be
  // a root Task
  List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
  if (rootTasks.contains(childTask)) {
    rootTasks.remove(childTask);
  }
  // Generate the temporary file name
  Context baseCtx = parseCtx.getContext();
  Path taskTmpDir = baseCtx.getMRTmpPath();
  Operator<? extends OperatorDesc> parent = op.getParentOperators().get(0);
  TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
  // Create the temporary file, its corresponding FileSinkOperator, and
  // its corresponding TableScanOperator.
  TableScanOperator tableScanOp = createTemporaryFile(parent, op, taskTmpDir, tt_desc, parseCtx);
  Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
  mapCurrCtx.put(tableScanOp, new GenMapRedCtx(childTask, null));
  String streamDesc = taskTmpDir.toUri().toString();
  MapredWork cplan = (MapredWork) childTask.getWork();
  if (needsTagging(cplan.getReduceWork())) {
    Operator<? extends OperatorDesc> reducerOp = cplan.getReduceWork().getReducer();
    String id = null;
    if (reducerOp instanceof JoinOperator) {
      if (parseCtx.getJoinOps().contains(reducerOp)) {
        id = ((JoinOperator) reducerOp).getConf().getId();
      }
    } else if (reducerOp instanceof MapJoinOperator) {
      if (parseCtx.getMapJoinOps().contains(reducerOp)) {
        id = ((MapJoinOperator) reducerOp).getConf().getId();
      }
    } else if (reducerOp instanceof SMBMapJoinOperator) {
      if (parseCtx.getSmbMapJoinOps().contains(reducerOp)) {
        id = ((SMBMapJoinOperator) reducerOp).getConf().getId();
      }
    }
    if (id != null) {
      streamDesc = id + ":$INTNAME";
    } else {
      streamDesc = "$INTNAME";
    }
    String origStreamDesc = streamDesc;
    int pos = 0;
    while (cplan.getMapWork().getAliasToWork().get(streamDesc) != null) {
      streamDesc = origStreamDesc.concat(String.valueOf(++pos));
    }
    // TODO: Allocate work to remove the temporary files and make that
    // dependent on the redTask
    cplan.getReduceWork().setNeedsTagging(true);
  }
  // Add the path to alias mapping
  setTaskPlan(taskTmpDir, streamDesc, tableScanOp, cplan.getMapWork(), false, tt_desc);
  opProcCtx.setCurrTopOp(null);
  opProcCtx.setCurrAliasId(null);
  opProcCtx.setCurrTask(childTask);
  opProcCtx.addRootIfPossible(parentTask);
}
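One detail worth calling out: when the reducer needs tagging, the intermediate stream is registered under the alias "$INTNAME" (optionally prefixed with the join operator id), and a numeric suffix is appended until the alias is unused in aliasToWork. Below is a minimal standalone sketch of that uniquing pattern against a plain HashMap; the class and method names (AliasUniquer, uniqueAlias) are hypothetical illustrations, not Hive API.

import java.util.HashMap;
import java.util.Map;

public class AliasUniquer {

  // Returns 'base' if it is free, otherwise base1, base2, ...
  // (mirrors the streamDesc loop in splitTasks, but against a plain Map).
  static String uniqueAlias(String base, Map<String, Object> aliasToWork) {
    String candidate = base;
    int pos = 0;
    while (aliasToWork.get(candidate) != null) {
      candidate = base.concat(String.valueOf(++pos));
    }
    return candidate;
  }

  public static void main(String[] args) {
    Map<String, Object> aliasToWork = new HashMap<>();
    aliasToWork.put("$INTNAME", new Object());
    aliasToWork.put("$INTNAME1", new Object());
    // Prints "$INTNAME2": the first two candidates are already taken.
    System.out.println(uniqueAlias("$INTNAME", aliasToWork));
  }
}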
use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.
the class SparkMapJoinProcessor method convertMapJoin.
/**
 * Convert a regular join to a map-side join.
 *
 * @param conf
 * @param op join operator
 * @param leftSrc
 * @param baseSrc
 * @param mapAliases
 * @param bigTablePos position of the source to be read as part of
 *   map-reduce framework. All other sources are cached in memory
 * @param noCheckOuterJoin
 * @param validateMapJoinTree
 */
@Override
public MapJoinOperator convertMapJoin(HiveConf conf, JoinOperator op, boolean leftSrc, String[] baseSrc, List<String> mapAliases, int bigTablePos, boolean noCheckOuterJoin, boolean validateMapJoinTree) throws SemanticException {
  // outer join cannot be performed on a table which is being cached
  JoinCondDesc[] condns = op.getConf().getConds();
  if (!noCheckOuterJoin) {
    if (checkMapJoin(bigTablePos, condns) < 0) {
      throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
    }
  }
  // create the map-join operator
  MapJoinOperator mapJoinOp = convertJoinOpMapJoinOp(conf, op, op.getConf().isLeftInputJoin(), op.getConf().getBaseSrc(), op.getConf().getMapAliases(), bigTablePos, noCheckOuterJoin);
  // 1. remove RS as parent for the big table branch
  // 2. remove old join op from child set of all the RSs
  List<Operator<? extends OperatorDesc>> parentOps = mapJoinOp.getParentOperators();
  for (int i = 0; i < parentOps.size(); i++) {
    Operator<? extends OperatorDesc> parentOp = parentOps.get(i);
    parentOp.getChildOperators().remove(op);
    if (i == bigTablePos) {
      List<Operator<? extends OperatorDesc>> grandParentOps = parentOp.getParentOperators();
      Preconditions.checkArgument(grandParentOps.size() == 1, "AssertionError: expect number of parents to be 1, but was " + grandParentOps.size());
      Operator<? extends OperatorDesc> grandParentOp = grandParentOps.get(0);
      grandParentOp.replaceChild(parentOp, mapJoinOp);
      mapJoinOp.replaceParent(parentOp, grandParentOp);
    }
  }
  return mapJoinOp;
}
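The loop above rewires the operator DAG: every parent reduce sink drops the old join from its children, and on the big-table branch the reduce sink is bypassed entirely by connecting its grandparent directly to the new MapJoinOperator. The following is a self-contained sketch of that rewiring pattern using a hypothetical Node class rather than Hive's Operator hierarchy; only the replaceChild/replaceParent idea is taken from the snippet above.

import java.util.ArrayList;
import java.util.List;

// Hypothetical minimal DAG node; Hive's Operator exposes analogous
// replaceChild/replaceParent methods used in convertMapJoin above.
class Node {
  final String name;
  final List<Node> parents = new ArrayList<>();
  final List<Node> children = new ArrayList<>();

  Node(String name) { this.name = name; }

  void replaceChild(Node oldChild, Node newChild) {
    children.set(children.indexOf(oldChild), newChild);
  }

  void replaceParent(Node oldParent, Node newParent) {
    parents.set(parents.indexOf(oldParent), newParent);
  }
}

public class BigTableBypassSketch {
  public static void main(String[] args) {
    // scan -> rs (reduce sink) -> mapJoin, with rs standing in for the
    // big-table branch that the conversion wants to bypass.
    Node scan = new Node("TS"), rs = new Node("RS"), mapJoin = new Node("MAPJOIN");
    scan.children.add(rs);
    rs.parents.add(scan);
    rs.children.add(mapJoin);
    mapJoin.parents.add(rs);

    // Bypass the reduce sink on the big-table branch: the scan now feeds
    // the map join directly, mirroring the grandParentOp rewiring above.
    scan.replaceChild(rs, mapJoin);
    mapJoin.replaceParent(rs, scan);

    System.out.println(scan.children.get(0).name);   // MAPJOIN
    System.out.println(mapJoin.parents.get(0).name); // TS
  }
}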
use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.
the class JoinReorder method transform.
/**
 * Transform the query tree. For each join, check which reduce sink will
 * output the biggest result (based on STREAMTABLE hints) and give it the
 * biggest tag so that it gets streamed.
 *
 * @param pactx current parse context
 */
@Override
public ParseContext transform(ParseContext pactx) throws SemanticException {
  Set<String> bigTables = getBigTables(pactx);
  cache.clear();
  for (JoinOperator joinOp : pactx.getJoinOps()) {
    reorder(joinOp, bigTables);
  }
  return pactx;
}
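The intent of the reorder step is that inputs named in a STREAMTABLE hint end up with the biggest tag, so they are streamed through the reducer rather than buffered. A minimal sketch of that idea on plain alias lists follows; the names (StreamTableOrderSketch, reorder) are hypothetical and do not reflect the actual signature of JoinReorder.reorder.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

public class StreamTableOrderSketch {

  // Push hinted "big" aliases to the end of the tag order so they receive
  // the biggest tag and are streamed instead of buffered.
  static List<String> reorder(List<String> tagOrder, Set<String> bigTables) {
    List<String> small = new ArrayList<>();
    List<String> big = new ArrayList<>();
    for (String alias : tagOrder) {
      (bigTables.contains(alias) ? big : small).add(alias);
    }
    small.addAll(big);
    return small;
  }

  public static void main(String[] args) {
    List<String> tagOrder = Arrays.asList("orders", "customers", "lineitem");
    Set<String> bigTables = new LinkedHashSet<>(Arrays.asList("orders"));
    // Prints [customers, lineitem, orders]: the hinted table moves to the
    // last position and therefore gets the biggest tag.
    System.out.println(reorder(tagOrder, bigTables));
  }
}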
use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.
the class CorrelationOptimizer method findPossibleAutoConvertedJoinOperators.
private void findPossibleAutoConvertedJoinOperators() throws SemanticException {
  // based on hive.auto.convert.join.noconditionaltask.size.
  for (JoinOperator joinOp : pCtx.getJoinOps()) {
    boolean isAbleToGuess = true;
    boolean mayConvert = false;
    // Get total size and individual alias's size
    long aliasTotalKnownInputSize = 0;
    Map<String, Long> aliasToSize = new HashMap<String, Long>();
    Map<Integer, Set<String>> posToAliases = new HashMap<Integer, Set<String>>();
    for (int pos = 0; pos < joinOp.getNumParent(); pos++) {
      Operator<? extends OperatorDesc> op = joinOp.getParentOperators().get(pos);
      Set<TableScanOperator> topOps = CorrelationUtilities.findTableScanOperators(op);
      if (topOps.isEmpty()) {
        isAbleToGuess = false;
        break;
      }
      Set<String> aliases = new LinkedHashSet<String>();
      for (TableScanOperator tsop : topOps) {
        Table table = tsop.getConf().getTableMetadata();
        if (table == null) {
          // table should not be null.
          throw new SemanticException("The table of " + tsop.getName() + " " + tsop.getIdentifier() + " is null, which is not expected.");
        }
        String alias = tsop.getConf().getAlias();
        aliases.add(alias);
        Path p = table.getPath();
        ContentSummary resultCs = null;
        try {
          FileSystem fs = table.getPath().getFileSystem(pCtx.getConf());
          resultCs = fs.getContentSummary(p);
        } catch (IOException e) {
          LOG.warn("Encountered an error while querying content summary of table " + table.getCompleteName() + " from FileSystem. " + "Cannot guess if CommonJoinOperator will optimize " + joinOp.getName() + " " + joinOp.getIdentifier());
        }
        if (resultCs == null) {
          isAbleToGuess = false;
          break;
        }
        long size = resultCs.getLength();
        aliasTotalKnownInputSize += size;
        Long es = aliasToSize.get(alias);
        if (es == null) {
          es = Long.valueOf(0);
        }
        es += size;
        aliasToSize.put(alias, es);
      }
      posToAliases.put(pos, aliases);
    }
    if (!isAbleToGuess) {
      LOG.info("Cannot guess if CommonJoinOperator will optimize " + joinOp.getName() + " " + joinOp.getIdentifier());
      continue;
    }
    JoinDesc joinDesc = joinOp.getConf();
    Byte[] order = joinDesc.getTagOrder();
    int numAliases = order.length;
    Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds());
    if (bigTableCandidates.isEmpty()) {
      continue;
    }
    long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(pCtx.getConf(), HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
    for (int i = 0; i < numAliases; i++) {
      // this table cannot be big table
      if (!bigTableCandidates.contains(i)) {
        continue;
      }
      Set<String> aliases = posToAliases.get(i);
      long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
      if (!CommonJoinTaskDispatcher.cannotConvert(aliasKnownSize, aliasTotalKnownInputSize, ThresholdOfSmallTblSizeSum)) {
        mayConvert = true;
      }
    }
    if (mayConvert) {
      LOG.info(joinOp.getName() + " " + joinOp.getIdentifier() + " may be converted to MapJoin by CommonJoinResolver");
      skipedJoinOperators.add(joinOp);
    }
  }
}
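The size test delegated to CommonJoinTaskDispatcher.cannotConvert is, in essence, a check that everything except the big-table candidate fits under the small-table size threshold. The sketch below restates that check in isolation; the exact semantics of cannotConvert and the class/method names used here (MapJoinSizeCheckSketch, mayConvert) are assumptions made for illustration, not Hive code.

import java.util.HashMap;
import java.util.Map;

public class MapJoinSizeCheckSketch {

  // A position may host the big table when everything *else* fits under
  // the small-table size threshold (assumed semantics of cannotConvert).
  static boolean mayConvert(Map<String, Long> aliasToSize, String bigTableAlias, long smallTableThreshold) {
    long total = 0;
    for (long size : aliasToSize.values()) {
      total += size;
    }
    long bigTableSize = aliasToSize.getOrDefault(bigTableAlias, 0L);
    long everythingElse = total - bigTableSize;
    return everythingElse <= smallTableThreshold;
  }

  public static void main(String[] args) {
    Map<String, Long> aliasToSize = new HashMap<>();
    aliasToSize.put("lineitem", 10_000_000_000L);
    aliasToSize.put("nation", 2_000L);
    aliasToSize.put("region", 1_000L);
    long threshold = 25_000_000L; // stand-in for the configured small-table file size
    // Prints true: the two small tables together stay well below the threshold.
    System.out.println(mayConvert(aliasToSize, "lineitem", threshold));
  }
}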
use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.
the class HiveOpConverter method translateJoin.
private OpAttr translateJoin(RelNode joinRel) throws SemanticException {
  // 0. Additional data structures needed for the join optimization
  // through Hive
  String[] baseSrc = new String[joinRel.getInputs().size()];
  String tabAlias = getHiveDerivedTableAlias();
  // 1. Convert inputs
  OpAttr[] inputs = new OpAttr[joinRel.getInputs().size()];
  List<Operator<?>> children = new ArrayList<Operator<?>>(joinRel.getInputs().size());
  for (int i = 0; i < inputs.length; i++) {
    inputs[i] = dispatch(joinRel.getInput(i));
    children.add(inputs[i].inputs.get(0));
    baseSrc[i] = inputs[i].tabAlias;
  }
  // 2. Generate tags
  for (int tag = 0; tag < children.size(); tag++) {
    ReduceSinkOperator reduceSinkOp = (ReduceSinkOperator) children.get(tag);
    reduceSinkOp.getConf().setTag(tag);
  }
  // 3. Virtual columns
  Set<Integer> newVcolsInCalcite = new HashSet<Integer>();
  newVcolsInCalcite.addAll(inputs[0].vcolsInCalcite);
  if (joinRel instanceof HiveMultiJoin || !(joinRel instanceof SemiJoin)) {
    int shift = inputs[0].inputs.get(0).getSchema().getSignature().size();
    for (int i = 1; i < inputs.length; i++) {
      newVcolsInCalcite.addAll(HiveCalciteUtil.shiftVColsSet(inputs[i].vcolsInCalcite, shift));
      shift += inputs[i].inputs.get(0).getSchema().getSignature().size();
    }
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Translating operator rel#" + joinRel.getId() + ":" + joinRel.getRelTypeName() + " with row type: [" + joinRel.getRowType() + "]");
  }
  // 4. Extract join key expressions from HiveSortExchange
  ExprNodeDesc[][] joinExpressions = new ExprNodeDesc[inputs.length][];
  for (int i = 0; i < inputs.length; i++) {
    joinExpressions[i] = ((HiveSortExchange) joinRel.getInput(i)).getJoinExpressions();
  }
  // 5. Extract rest of join predicate info. We infer the rest of join condition
  // that will be added to the filters (join conditions that are not part of
  // the join key)
  List<RexNode> joinFilters;
  if (joinRel instanceof HiveJoin) {
    joinFilters = ImmutableList.of(((HiveJoin) joinRel).getJoinFilter());
  } else if (joinRel instanceof HiveMultiJoin) {
    joinFilters = ((HiveMultiJoin) joinRel).getJoinFilters();
  } else if (joinRel instanceof HiveSemiJoin) {
    joinFilters = ImmutableList.of(((HiveSemiJoin) joinRel).getJoinFilter());
  } else {
    throw new SemanticException("Can't handle join type: " + joinRel.getClass().getName());
  }
  List<List<ExprNodeDesc>> filterExpressions = Lists.newArrayList();
  for (int i = 0; i < joinFilters.size(); i++) {
    List<ExprNodeDesc> filterExpressionsForInput = new ArrayList<ExprNodeDesc>();
    if (joinFilters.get(i) != null) {
      for (RexNode conj : RelOptUtil.conjunctions(joinFilters.get(i))) {
        ExprNodeDesc expr = convertToExprNode(conj, joinRel, null, newVcolsInCalcite);
        filterExpressionsForInput.add(expr);
      }
    }
    filterExpressions.add(filterExpressionsForInput);
  }
  // 6. Generate Join operator
  JoinOperator joinOp = genJoin(joinRel, joinExpressions, filterExpressions, children, baseSrc, tabAlias);
  // 7. Return result
  return new OpAttr(tabAlias, newVcolsInCalcite, joinOp);
}
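Step 3 above shifts each input's virtual-column positions by the accumulated width of the preceding inputs, so the indices stay correct in the joined row schema. Here is a small self-contained sketch of that shifting; the helper (VColShiftSketch.shift) is a hypothetical stand-in for HiveCalciteUtil.shiftVColsSet.

import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;

public class VColShiftSketch {

  // Offset each virtual-column position by the combined width of the
  // inputs that precede it in the joined row.
  static Set<Integer> shift(Set<Integer> vcols, int offset) {
    Set<Integer> shifted = new LinkedHashSet<>();
    for (int pos : vcols) {
      shifted.add(pos + offset);
    }
    return shifted;
  }

  public static void main(String[] args) {
    // Left input has 5 columns with a virtual column at position 4;
    // right input has a virtual column at position 2.
    Set<Integer> leftVcols = new HashSet<>(Arrays.asList(4));
    Set<Integer> rightVcols = new HashSet<>(Arrays.asList(2));
    int leftWidth = 5;

    Set<Integer> joined = new LinkedHashSet<>(leftVcols);
    joined.addAll(shift(rightVcols, leftWidth));
    // Prints [4, 7]: the right input's virtual column lands after the
    // left input's 5 columns in the joined schema.
    System.out.println(joined);
  }
}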