Example 16 with MapJoinOperator

use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.

the class GenMapRedUtils method splitTasks.

@SuppressWarnings("nls")
private static /**
 * Split two tasks by creating a temporary file between them.
 *
 * @param op reduce sink operator being processed
 * @param parentTask the parent task
 * @param childTask the child task
 * @param opProcCtx context
 */
void splitTasks(ReduceSinkOperator op, Task<? extends Serializable> parentTask, Task<? extends Serializable> childTask, GenMRProcContext opProcCtx) throws SemanticException {
    if (op.getNumParent() != 1) {
        throw new IllegalStateException("Expecting operator " + op + " to have one parent. " + "But found multiple parents : " + op.getParentOperators());
    }
    ParseContext parseCtx = opProcCtx.getParseCtx();
    parentTask.addDependentTask(childTask);
    // Root Task cannot depend on any other task, therefore childTask cannot be
    // a root Task
    List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
    if (rootTasks.contains(childTask)) {
        rootTasks.remove(childTask);
    }
    // Generate the temporary file name
    Context baseCtx = parseCtx.getContext();
    Path taskTmpDir = baseCtx.getMRTmpPath();
    Operator<? extends OperatorDesc> parent = op.getParentOperators().get(0);
    TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
    // Create the temporary file, its corresponding FileSinkOperator, and
    // its corresponding TableScanOperator.
    TableScanOperator tableScanOp = createTemporaryFile(parent, op, taskTmpDir, tt_desc, parseCtx);
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
    mapCurrCtx.put(tableScanOp, new GenMapRedCtx(childTask, null));
    String streamDesc = taskTmpDir.toUri().toString();
    MapredWork cplan = (MapredWork) childTask.getWork();
    if (needsTagging(cplan.getReduceWork())) {
        Operator<? extends OperatorDesc> reducerOp = cplan.getReduceWork().getReducer();
        String id = null;
        if (reducerOp instanceof JoinOperator) {
            if (parseCtx.getJoinOps().contains(reducerOp)) {
                id = ((JoinOperator) reducerOp).getConf().getId();
            }
        } else if (reducerOp instanceof MapJoinOperator) {
            if (parseCtx.getMapJoinOps().contains(reducerOp)) {
                id = ((MapJoinOperator) reducerOp).getConf().getId();
            }
        } else if (reducerOp instanceof SMBMapJoinOperator) {
            if (parseCtx.getSmbMapJoinOps().contains(reducerOp)) {
                id = ((SMBMapJoinOperator) reducerOp).getConf().getId();
            }
        }
        if (id != null) {
            streamDesc = id + ":$INTNAME";
        } else {
            streamDesc = "$INTNAME";
        }
        String origStreamDesc = streamDesc;
        int pos = 0;
        while (cplan.getMapWork().getAliasToWork().get(streamDesc) != null) {
            streamDesc = origStreamDesc.concat(String.valueOf(++pos));
        }
        // TODO: Allocate work to remove the temporary files and make that
        // dependent on the redTask
        cplan.getReduceWork().setNeedsTagging(true);
    }
    // Add the path to alias mapping
    setTaskPlan(taskTmpDir, streamDesc, tableScanOp, cplan.getMapWork(), false, tt_desc);
    opProcCtx.setCurrTopOp(null);
    opProcCtx.setCurrAliasId(null);
    opProcCtx.setCurrTask(childTask);
    opProcCtx.addRootIfPossible(parentTask);
}
Also used : ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Context(org.apache.hadoop.hive.ql.Context) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) Path(org.apache.hadoop.fs.Path) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) Serializable(java.io.Serializable) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
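
A note on the alias disambiguation above: streamDesc must be unique among the map work's aliases, so the while loop appends an increasing numeric suffix until the name is free. A minimal, self-contained sketch of just that pattern, using a plain Map in place of Hive's MapWork (the class name AliasDisambiguation and the helper uniqueAlias are illustrative, not Hive API):

import java.util.HashMap;
import java.util.Map;

public class AliasDisambiguation {

    // Mirrors the loop in splitTasks: starting from a base alias such as
    // "$INTNAME", append an increasing numeric suffix until the alias is
    // not yet bound in the alias-to-work map.
    static String uniqueAlias(String base, Map<String, ?> aliasToWork) {
        String alias = base;
        int pos = 0;
        while (aliasToWork.get(alias) != null) {
            alias = base.concat(String.valueOf(++pos));
        }
        return alias;
    }

    public static void main(String[] args) {
        Map<String, Object> aliasToWork = new HashMap<>();
        aliasToWork.put("$INTNAME", new Object());
        aliasToWork.put("$INTNAME1", new Object());
        // Prints $INTNAME2, the first suffix not already taken.
        System.out.println(uniqueAlias("$INTNAME", aliasToWork));
    }
}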

Example 17 with MapJoinOperator

use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.

the class MapJoinProcessor method genMapJoinOpAndLocalWork.

/**
 * Convert the join to a map-join and also generate any local work needed.
 *
 * @param conf current Hive configuration
 * @param newWork MapredWork in which the conversion is to happen
 * @param op
 *          the join operator that needs to be converted to a map-join
 * @param mapJoinPos position of the big (streamed) table in the join
 * @throws SemanticException
 */
public static void genMapJoinOpAndLocalWork(HiveConf conf, MapredWork newWork, JoinOperator op, int mapJoinPos) throws SemanticException {
    // generate the map join operator; already checked the map join
    MapJoinOperator newMapJoinOp = new MapJoinProcessor().convertMapJoin(conf, op, newWork.getMapWork().isLeftInputJoin(), newWork.getMapWork().getBaseSrc(), newWork.getMapWork().getMapAliases(), mapJoinPos, true, false);
    genLocalWorkForMapJoin(newWork, newMapJoinOp, mapJoinPos);
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator)
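
A hedged sketch of a call site for the method above, assuming conf, newWork, and joinOp are already available from a compilation context (the wrapper name convertToMapJoin is illustrative; the types are the Hive classes imported in the examples above):

// Sketch only: assumes `conf`, `newWork`, and `joinOp` already exist in a
// compilation context; names are illustrative.
void convertToMapJoin(HiveConf conf, MapredWork newWork, JoinOperator joinOp)
        throws SemanticException {
    int bigTablePos = 0; // stream the first input, cache the others in memory
    MapJoinProcessor.genMapJoinOpAndLocalWork(conf, newWork, joinOp, bigTablePos);
    // After the call, newWork's map work contains the MapJoinOperator plus
    // the local work needed to load the small-table sides into memory.
}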

Example 18 with MapJoinOperator

use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.

the class MapJoinProcessor method convertSMBJoinToMapJoin.

/**
 * Convert a sort-merge join to a map-side join.
 *
 * @param hconf current Hive configuration
 * @param smbJoinOp
 *          the sort-merge join operator to convert
 * @param bigTablePos
 *          position of the source to be read as part of the map-reduce framework. All other
 *          sources are cached in memory
 * @param noCheckOuterJoin
 */
public static MapJoinOperator convertSMBJoinToMapJoin(HiveConf hconf, SMBMapJoinOperator smbJoinOp, int bigTablePos, boolean noCheckOuterJoin) throws SemanticException {
    // Create a new map join operator
    SMBJoinDesc smbJoinDesc = smbJoinOp.getConf();
    List<ExprNodeDesc> keyCols = smbJoinDesc.getKeys().get(Byte.valueOf((byte) 0));
    TableDesc keyTableDesc = PlanUtils.getMapJoinKeyTableDesc(hconf, PlanUtils.getFieldSchemasFromColumnList(keyCols, MAPJOINKEY_FIELDPREFIX));
    MapJoinDesc mapJoinDesc = new MapJoinDesc(smbJoinDesc.getKeys(), keyTableDesc, smbJoinDesc.getExprs(), smbJoinDesc.getValueTblDescs(), smbJoinDesc.getValueTblDescs(), smbJoinDesc.getOutputColumnNames(), bigTablePos, smbJoinDesc.getConds(), smbJoinDesc.getFilters(), smbJoinDesc.isNoOuterJoin(), smbJoinDesc.getDumpFilePrefix(), smbJoinDesc.getMemoryMonitorInfo(), smbJoinDesc.getInMemoryDataSize());
    mapJoinDesc.setStatistics(smbJoinDesc.getStatistics());
    mapJoinDesc.setColumnExprMap(smbJoinDesc.getColumnExprMap());
    RowSchema joinRS = smbJoinOp.getSchema();
    // The mapjoin has the same schema as the join operator
    MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(smbJoinOp.getCompilationOpContext(), mapJoinDesc, joinRS, new ArrayList<Operator<? extends OperatorDesc>>());
    // change the children of the original join operator to point to the map
    // join operator
    List<Operator<? extends OperatorDesc>> childOps = smbJoinOp.getChildOperators();
    for (Operator<? extends OperatorDesc> childOp : childOps) {
        childOp.replaceParent(smbJoinOp, mapJoinOp);
    }
    mapJoinOp.setChildOperators(childOps);
    smbJoinOp.setChildOperators(null);
    // change the parent of the original SMBjoin operator to point to the map
    // join operator
    List<Operator<? extends OperatorDesc>> parentOps = smbJoinOp.getParentOperators();
    for (Operator<? extends OperatorDesc> parentOp : parentOps) {
        parentOp.replaceChild(smbJoinOp, mapJoinOp);
    }
    mapJoinOp.setParentOperators(parentOps);
    smbJoinOp.setParentOperators(null);
    return mapJoinOp;
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) LateralViewJoinOperator(org.apache.hadoop.hive.ql.exec.LateralViewJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) ScriptOperator(org.apache.hadoop.hive.ql.exec.ScriptOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) SMBJoinDesc(org.apache.hadoop.hive.ql.plan.SMBJoinDesc) ArrayList(java.util.ArrayList) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
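
The parent/child rewiring at the end of this method is a generic DAG splice: every child of the old operator re-points its parent reference to the new operator, then the parent list is moved the same way. A minimal, self-contained sketch of that pattern on a toy node type (not Hive's Operator API; the Node class and splice method are illustrative):

import java.util.ArrayList;
import java.util.List;

public class OperatorSplice {

    // Toy stand-in for an operator DAG node; just enough to show the splice.
    static class Node {
        final String name;
        List<Node> parents = new ArrayList<>();
        List<Node> children = new ArrayList<>();

        Node(String name) { this.name = name; }

        void replaceParent(Node oldParent, Node newParent) {
            parents.set(parents.indexOf(oldParent), newParent);
        }

        void replaceChild(Node oldChild, Node newChild) {
            children.set(children.indexOf(oldChild), newChild);
        }
    }

    // Splice `replacement` into the DAG where `original` sits, using the
    // same two-phase rewiring as convertSMBJoinToMapJoin: children first,
    // then parents, clearing the old node's references as we go.
    static void splice(Node original, Node replacement) {
        for (Node child : original.children) {
            child.replaceParent(original, replacement);
        }
        replacement.children = original.children;
        original.children = null;

        for (Node parent : original.parents) {
            parent.replaceChild(original, replacement);
        }
        replacement.parents = original.parents;
        original.parents = null;
    }
}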

Example 19 with MapJoinOperator

use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.

the class SharedWorkOptimizer method extractSharedOptimizationInfo.

private static SharedResult extractSharedOptimizationInfo(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, Operator<?> retainableOpEqualParent, Operator<?> discardableOpEqualParent, Operator<?> retainableOp, Operator<?> discardableOp, LinkedHashSet<Operator<?>> retainableOps, LinkedHashSet<Operator<?>> discardableOps, Set<Operator<?>> discardableInputOps, boolean removeInputBranch) throws SemanticException {
    Operator<?> equalOp1 = retainableOpEqualParent;
    Operator<?> equalOp2 = discardableOpEqualParent;
    Operator<?> currentOp1 = retainableOp;
    Operator<?> currentOp2 = discardableOp;
    long dataSize = 0L;
    long maxDataSize = 0L;
    // Try to merge rest of operators
    while (!(currentOp1 instanceof ReduceSinkOperator)) {
        // Check whether current operators are equal
        if (!compareOperator(pctx, currentOp1, currentOp2)) {
            // If they are not equal, we could zip up till here
            break;
        }
        if (currentOp1.getParentOperators().size() != currentOp2.getParentOperators().size()) {
            // If they are not equal, we could zip up till here
            break;
        }
        if (currentOp1.getParentOperators().size() > 1) {
            List<Operator<?>> discardableOpsForCurrentOp = new ArrayList<>();
            int idx = 0;
            for (; idx < currentOp1.getParentOperators().size(); idx++) {
                Operator<?> parentOp1 = currentOp1.getParentOperators().get(idx);
                Operator<?> parentOp2 = currentOp2.getParentOperators().get(idx);
                if (parentOp1 == equalOp1 && parentOp2 == equalOp2 && !removeInputBranch) {
                    continue;
                }
                if ((parentOp1 == equalOp1 && parentOp2 != equalOp2) || (parentOp1 != equalOp1 && parentOp2 == equalOp2)) {
                    // Input operator is not in the same position
                    break;
                }
                // Compare input
                List<Operator<?>> removeOpsForCurrentInput = compareAndGatherOps(pctx, parentOp1, parentOp2);
                if (removeOpsForCurrentInput == null) {
                    // Inputs are not the same, bail out
                    break;
                }
                // Add inputs to ops to remove
                discardableOpsForCurrentOp.addAll(removeOpsForCurrentInput);
            }
            if (idx != currentOp1.getParentOperators().size()) {
                // If inputs are not equal, we could zip up till here
                break;
            }
            discardableInputOps.addAll(discardableOpsForCurrentOp);
        }
        equalOp1 = currentOp1;
        equalOp2 = currentOp2;
        retainableOps.add(equalOp1);
        discardableOps.add(equalOp2);
        if (equalOp1 instanceof MapJoinOperator) {
            MapJoinOperator mop = (MapJoinOperator) equalOp1;
            dataSize = StatsUtils.safeAdd(dataSize, mop.getConf().getInMemoryDataSize());
            maxDataSize = mop.getConf().getMemoryMonitorInfo().getAdjustedNoConditionalTaskSize();
        }
        if (currentOp1.getChildOperators().size() > 1 || currentOp2.getChildOperators().size() > 1) {
            // TODO: Support checking multiple child operators to merge further.
            break;
        }
        // Update for next iteration
        currentOp1 = currentOp1.getChildOperators().get(0);
        currentOp2 = currentOp2.getChildOperators().get(0);
    }
    // Add the rest to the memory consumption
    Set<Operator<?>> opsWork1 = findWorkOperators(optimizerCache, currentOp1);
    for (Operator<?> op : opsWork1) {
        if (op instanceof MapJoinOperator && !retainableOps.contains(op)) {
            MapJoinOperator mop = (MapJoinOperator) op;
            dataSize = StatsUtils.safeAdd(dataSize, mop.getConf().getInMemoryDataSize());
            maxDataSize = mop.getConf().getMemoryMonitorInfo().getAdjustedNoConditionalTaskSize();
        }
    }
    Set<Operator<?>> opsWork2 = findWorkOperators(optimizerCache, currentOp2);
    for (Operator<?> op : opsWork2) {
        if (op instanceof MapJoinOperator && !discardableOps.contains(op)) {
            MapJoinOperator mop = (MapJoinOperator) op;
            dataSize = StatsUtils.safeAdd(dataSize, mop.getConf().getInMemoryDataSize());
            maxDataSize = mop.getConf().getMemoryMonitorInfo().getAdjustedNoConditionalTaskSize();
        }
    }
    discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, discardableInputOps));
    discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, discardableOps));
    discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, retainableOps, discardableInputOps));
    return new SharedResult(retainableOps, discardableOps, discardableInputOps, dataSize, maxDataSize);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) ArrayList(java.util.ArrayList)
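
The main while loop above is essentially a lock-step walk that zips two operator chains together until the first mismatch. A stripped-down sketch of just that control flow, on plain strings instead of operators and without the multi-parent or MapJoin memory bookkeeping (the class ChainZipper and method zipLength are illustrative only):

import java.util.List;

public class ChainZipper {

    // Walks two single-child chains in lock step and returns how many
    // leading positions are pairwise equal -- the same "zip up until the
    // first mismatch" idea as extractSharedOptimizationInfo.
    static int zipLength(List<String> chain1, List<String> chain2) {
        int n = Math.min(chain1.size(), chain2.size());
        int i = 0;
        while (i < n && chain1.get(i).equals(chain2.get(i))) {
            i++;
        }
        return i;
    }

    public static void main(String[] args) {
        // TS -> FIL -> SEL vs TS -> FIL -> GBY: the shared prefix has length 2.
        System.out.println(zipLength(
                List.of("TS", "FIL", "SEL"),
                List.of("TS", "FIL", "GBY")));
    }
}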

Example 20 with MapJoinOperator

use of org.apache.hadoop.hive.ql.exec.MapJoinOperator in project hive by apache.

the class SparkMapJoinProcessor method convertMapJoin.

/**
 * Convert a regular join to a map-side join.
 *
 * @param conf current Hive configuration
 * @param op join operator
 * @param leftSrc
 * @param baseSrc
 * @param mapAliases
 * @param bigTablePos position of the source to be read as part of
 *                   map-reduce framework. All other sources are cached in memory
 * @param noCheckOuterJoin
 * @param validateMapJoinTree
 */
@Override
public MapJoinOperator convertMapJoin(HiveConf conf, JoinOperator op, boolean leftSrc, String[] baseSrc, List<String> mapAliases, int bigTablePos, boolean noCheckOuterJoin, boolean validateMapJoinTree) throws SemanticException {
    // outer join cannot be performed on a table which is being cached
    JoinCondDesc[] condns = op.getConf().getConds();
    if (!noCheckOuterJoin) {
        if (checkMapJoin(bigTablePos, condns) < 0) {
            throw new SemanticException(ErrorMsg.NO_OUTER_MAPJOIN.getMsg());
        }
    }
    // create the map-join operator
    MapJoinOperator mapJoinOp = convertJoinOpMapJoinOp(conf, op, op.getConf().isLeftInputJoin(), op.getConf().getBaseSrc(), op.getConf().getMapAliases(), bigTablePos, noCheckOuterJoin);
    // 1. remove RS as parent for the big table branch
    // 2. remove old join op from child set of all the RSs
    List<Operator<? extends OperatorDesc>> parentOps = mapJoinOp.getParentOperators();
    for (int i = 0; i < parentOps.size(); i++) {
        Operator<? extends OperatorDesc> parentOp = parentOps.get(i);
        parentOp.getChildOperators().remove(op);
        if (i == bigTablePos) {
            List<Operator<? extends OperatorDesc>> grandParentOps = parentOp.getParentOperators();
            Preconditions.checkArgument(grandParentOps.size() == 1, "AssertionError: expect number of parents to be 1, but was " + grandParentOps.size());
            Operator<? extends OperatorDesc> grandParentOp = grandParentOps.get(0);
            grandParentOp.replaceChild(parentOp, mapJoinOp);
            mapJoinOp.replaceParent(parentOp, grandParentOp);
        }
    }
    return mapJoinOp;
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) JoinCondDesc(org.apache.hadoop.hive.ql.plan.JoinCondDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
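
A hedged sketch of invoking this override, assuming conf and joinOp come from an existing Spark compilation plan; the wrapper name toMapJoin and the chosen big-table position are illustrative. The boolean, baseSrc, and mapAliases arguments are read back from the join's own descriptor, matching how the method body above uses op.getConf():

// Sketch only: `conf` and `joinOp` are assumed to exist already.
MapJoinOperator toMapJoin(HiveConf conf, JoinOperator joinOp)
        throws SemanticException {
    // With noCheckOuterJoin == false, checkMapJoin rejects big-table
    // positions that would break outer-join semantics (NO_OUTER_MAPJOIN).
    return new SparkMapJoinProcessor().convertMapJoin(
            conf,
            joinOp,
            joinOp.getConf().isLeftInputJoin(),
            joinOp.getConf().getBaseSrc(),
            joinOp.getConf().getMapAliases(),
            /* bigTablePos */ 1,
            /* noCheckOuterJoin */ false,
            /* validateMapJoinTree */ true);
}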

Aggregations

MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 41 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 22 usages
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 22 usages
ArrayList (java.util.ArrayList): 19 usages
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 18 usages
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 17 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 15 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 13 usages
MapJoinDesc (org.apache.hadoop.hive.ql.plan.MapJoinDesc): 12 usages
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 11 usages
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 9 usages
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 9 usages
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 9 usages
HashMap (java.util.HashMap): 8 usages
AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator): 8 usages
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 8 usages
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 8 usages
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 8 usages
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 8 usages
List (java.util.List): 7 usages