Example 31 with FileSinkOperator

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in the project hive by apache.

From the class CommonJoinTaskDispatcher, method mergeMapJoinTaskIntoItsChildMapRedTask.

/*
   * A task and its child task have been converted from join to mapjoin.
   * See if the two tasks can be merged.
   */
private void mergeMapJoinTaskIntoItsChildMapRedTask(MapRedTask mapJoinTask, Configuration conf) throws SemanticException {
    // Step 1: Check whether mapJoinTask has exactly one child task. If so, check if we can merge mapJoinTask into that child.
    if (mapJoinTask.getChildTasks() == null || mapJoinTask.getChildTasks().size() > 1) {
        // No child task, or more than one child task; in either case we don't want to do anything.
        return;
    }
    Task<?> childTask = mapJoinTask.getChildTasks().get(0);
    if (!(childTask instanceof MapRedTask)) {
        // Nothing to do if it is not a MapReduce task.
        return;
    }
    MapRedTask childMapRedTask = (MapRedTask) childTask;
    MapWork mapJoinMapWork = mapJoinTask.getWork().getMapWork();
    MapWork childMapWork = childMapRedTask.getWork().getMapWork();
    Map<String, Operator<? extends OperatorDesc>> mapJoinAliasToWork = mapJoinMapWork.getAliasToWork();
    if (mapJoinAliasToWork.size() > 1) {
        // Do not merge if the MapredWork of MapJoin has multiple input aliases.
        return;
    }
    Entry<String, Operator<? extends OperatorDesc>> mapJoinAliasToWorkEntry = mapJoinAliasToWork.entrySet().iterator().next();
    String mapJoinAlias = mapJoinAliasToWorkEntry.getKey();
    TableScanOperator mapJoinTaskTableScanOperator = OperatorUtils.findSingleOperator(mapJoinAliasToWorkEntry.getValue(), TableScanOperator.class);
    if (mapJoinTaskTableScanOperator == null) {
        throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + " operator as the work associated with alias " + mapJoinAlias + ". Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator.");
    }
    Set<FileSinkOperator> mapJoinTaskFileSinkOperators = OperatorUtils.findOperators(mapJoinTaskTableScanOperator, FileSinkOperator.class);
    if (mapJoinTaskFileSinkOperators.isEmpty()) {
        throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() + " operator at the last operator of the MapJoin Task.");
    }
    if (mapJoinTaskFileSinkOperators.size() > 1) {
        LOG.warn("Multiple " + FileSinkOperator.getOperatorName() + " operators found at the last operator of the MapJoin Task.");
        return;
    }
    // The directory written by mapJoinTaskFileSinkOperator must be an input path of the child MapRed task, mapped to exactly one alias.
    FileSinkOperator mapJoinTaskFileSinkOperator = mapJoinTaskFileSinkOperators.iterator().next();
    Path childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName();
    List<String> childMRAliases = childMapWork.getPathToAliases().get(childMRPath);
    if (childMRAliases == null || childMRAliases.size() != 1) {
        return;
    }
    String childMRAlias = childMRAliases.get(0);
    // Sanity check to make sure there is no alias conflict after merge.
    for (Entry<Path, List<String>> entry : childMapWork.getPathToAliases().entrySet()) {
        Path path = entry.getKey();
        List<String> aliases = entry.getValue();
        if (path.equals(childMRPath)) {
            continue;
        }
        if (aliases.contains(mapJoinAlias)) {
            // Do not merge if it would create an alias conflict.
            return;
        }
    }
    MapredLocalWork mapJoinLocalWork = mapJoinMapWork.getMapRedLocalWork();
    MapredLocalWork childLocalWork = childMapWork.getMapRedLocalWork();
    if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) || (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
        // Do not merge if either task involves a bucket map join; we should relax this constraint in a follow-up JIRA.
        return;
    }
    // Check that the total size of the local (small) tables stays under the limit after the merge.
    if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)) {
        // Do not merge.
        return;
    }
    TableScanOperator childMRTaskTableScanOperator = OperatorUtils.findSingleOperator(childMapWork.getAliasToWork().get(childMRAlias.toString()), TableScanOperator.class);
    if (childMRTaskTableScanOperator == null) {
        throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + " operator as the work associated with alias " + childMRAlias + ". Found a " + childMapWork.getAliasToWork().get(childMRAlias).getName() + " operator.");
    }
    List<Operator<? extends OperatorDesc>> parentsInMapJoinTask = mapJoinTaskFileSinkOperator.getParentOperators();
    List<Operator<? extends OperatorDesc>> childrenInChildMRTask = childMRTaskTableScanOperator.getChildOperators();
    if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) {
        // Do not merge if we do not know how to connect two operator trees.
        return;
    }
    // Step 2: Merge mapJoinTask into the Map-side of its child.
    // Step 2.1: Connect the operator trees of two MapRedTasks.
    Operator<? extends OperatorDesc> parentInMapJoinTask = parentsInMapJoinTask.get(0);
    Operator<? extends OperatorDesc> childInChildMRTask = childrenInChildMRTask.get(0);
    parentInMapJoinTask.replaceChild(mapJoinTaskFileSinkOperator, childInChildMRTask);
    childInChildMRTask.replaceParent(childMRTaskTableScanOperator, parentInMapJoinTask);
    // Step 2.2: Replace the corresponding part of the child task's MapWork with the MapJoin task's MapWork.
    GenMapRedUtils.replaceMapWork(mapJoinAlias, childMRAlias.toString(), mapJoinMapWork, childMapWork);
    // Step 2.3: Merge the MapJoin task's local work into the child task's local work.
    if (mapJoinLocalWork != null) {
        if (childLocalWork == null) {
            childMapWork.setMapRedLocalWork(mapJoinLocalWork);
        } else {
            childLocalWork.getAliasToFetchWork().putAll(mapJoinLocalWork.getAliasToFetchWork());
            childLocalWork.getAliasToWork().putAll(mapJoinLocalWork.getAliasToWork());
        }
    }
    // Step 2.4: Remove this MapJoin task
    List<Task<?>> parentTasks = mapJoinTask.getParentTasks();
    mapJoinTask.setParentTasks(null);
    mapJoinTask.setChildTasks(null);
    childMapRedTask.getParentTasks().remove(mapJoinTask);
    if (parentTasks != null) {
        childMapRedTask.getParentTasks().addAll(parentTasks);
        for (Task<?> parentTask : parentTasks) {
            parentTask.getChildTasks().remove(mapJoinTask);
            if (!parentTask.getChildTasks().contains(childMapRedTask)) {
                parentTask.getChildTasks().add(childMapRedTask);
            }
        }
    } else {
        if (physicalContext.getRootTasks().contains(mapJoinTask)) {
            physicalContext.removeFromRootTask(mapJoinTask);
            if (childMapRedTask.getParentTasks() != null && childMapRedTask.getParentTasks().size() == 0 && !physicalContext.getRootTasks().contains(childMapRedTask)) {
                physicalContext.addToRootTask(childMapRedTask);
            }
        }
    }
    if (childMapRedTask.getParentTasks().size() == 0) {
        childMapRedTask.setParentTasks(null);
    }
}
Also used : LateralViewForwardOperator(org.apache.hadoop.hive.ql.exec.LateralViewForwardOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) ArrayList(java.util.ArrayList) List(java.util.List) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
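
The heart of the merge is Step 2.1: the parent of the MapJoin task's FileSinkOperator is wired directly to the child of the downstream task's TableScanOperator, so both the intermediate file sink and the table scan drop out of the merged plan. Below is a minimal, self-contained sketch of that splice using a hypothetical Op class in place of Hive's Operator hierarchy; the class and its replaceChild/replaceParent helpers are illustrative stand-ins, not Hive API.

import java.util.ArrayList;
import java.util.List;

class Op {
    final String name;
    final List<Op> parents = new ArrayList<>();
    final List<Op> children = new ArrayList<>();

    Op(String name) {
        this.name = name;
    }

    // Swap oldChild for newChild in place, keeping its position in the child list.
    void replaceChild(Op oldChild, Op newChild) {
        children.set(children.indexOf(oldChild), newChild);
    }

    // Swap oldParent for newParent in place, keeping its position in the parent list.
    void replaceParent(Op oldParent, Op newParent) {
        parents.set(parents.indexOf(oldParent), newParent);
    }
}

public class OperatorSpliceSketch {
    public static void main(String[] args) {
        // MapJoin task tree: MAPJOIN -> FS (the single FileSinkOperator found above)
        Op mapJoin = new Op("MAPJOIN");
        Op fileSink = new Op("FS");
        mapJoin.children.add(fileSink);
        fileSink.parents.add(mapJoin);

        // Child MapRed task tree: TS -> SEL (the TableScanOperator reading FS's output)
        Op tableScan = new Op("TS");
        Op select = new Op("SEL");
        tableScan.children.add(select);
        select.parents.add(tableScan);

        // Step 2.1 equivalent: connect the two trees, dropping FS and TS.
        mapJoin.replaceChild(fileSink, select);
        select.replaceParent(tableScan, mapJoin);

        System.out.println(mapJoin.children.get(0).name); // SEL
        System.out.println(select.parents.get(0).name);   // MAPJOIN
    }
}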

Example 32 with FileSinkOperator

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in the project hive by apache.

From the class SetSparkReducerParallelism, method process.

@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    OptimizeSparkProcContext context = (OptimizeSparkProcContext) procContext;
    ReduceSinkOperator sink = (ReduceSinkOperator) nd;
    ReduceSinkDesc desc = sink.getConf();
    Set<ReduceSinkOperator> parentSinks = null;
    int maxReducers = context.getConf().getIntVar(HiveConf.ConfVars.MAXREDUCERS);
    int constantReducers = context.getConf().getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
    if (!useOpStats) {
        parentSinks = OperatorUtils.findOperatorsUpstream(sink, ReduceSinkOperator.class);
        parentSinks.remove(sink);
        if (!context.getVisitedReduceSinks().containsAll(parentSinks)) {
            // We haven't processed all the parent sinks, and we need
            // them to be done in order to compute the parallelism for this sink.
            // In this case, skip. We should visit this again from another path.
            LOG.debug("Skipping sink " + sink + " for now as we haven't seen all its parents.");
            return false;
        }
    }
    if (context.getVisitedReduceSinks().contains(sink)) {
        // skip walking the children
        LOG.debug("Already processed reduce sink: " + sink.getName());
        return true;
    }
    context.getVisitedReduceSinks().add(sink);
    if (needSetParallelism(sink, context.getConf())) {
        if (constantReducers > 0) {
            LOG.info("Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
            desc.setNumReducers(constantReducers);
        } else {
            // If it's a FileSink to bucketed files, use the bucket count as the reducer number
            FileSinkOperator fso = GenSparkUtils.getChildOperator(sink, FileSinkOperator.class);
            if (fso != null) {
                String bucketCount = fso.getConf().getTableInfo().getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT);
                int numBuckets = bucketCount == null ? 0 : Integer.parseInt(bucketCount);
                if (numBuckets > 0) {
                    LOG.info("Set parallelism for reduce sink " + sink + " to: " + numBuckets + " (buckets)");
                    desc.setNumReducers(numBuckets);
                    return false;
                }
            }
            if (useOpStats || parentSinks.isEmpty()) {
                long numberOfBytes = 0;
                if (useOpStats) {
                    // we need to add up all the estimates from the siblings of this reduce sink
                    for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
                        if (sibling.getStatistics() != null) {
                            numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize());
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Sibling " + sibling + " has stats: " + sibling.getStatistics());
                            }
                        } else {
                            LOG.warn("No stats available from: " + sibling);
                        }
                    }
                } else {
                    // we should use TS stats to infer parallelism
                    for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
                        Set<TableScanOperator> sources = OperatorUtils.findOperatorsUpstream(sibling, TableScanOperator.class);
                        for (TableScanOperator source : sources) {
                            if (source.getStatistics() != null) {
                                numberOfBytes = StatsUtils.safeAdd(numberOfBytes, source.getStatistics().getDataSize());
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Table source " + source + " has stats: " + source.getStatistics());
                                }
                            } else {
                                LOG.warn("No stats available from table source: " + source);
                            }
                        }
                    }
                    LOG.debug("Gathered stats for sink " + sink + ". Total size is " + numberOfBytes + " bytes.");
                }
                // Divide it by 2 so that we can have more reducers
                long bytesPerReducer = context.getConf().getLongVar(HiveConf.ConfVars.BYTESPERREDUCER) / 2;
                int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
                getSparkMemoryAndCores(context);
                if (sparkMemoryAndCores != null && sparkMemoryAndCores.getLeft() > 0 && sparkMemoryAndCores.getRight() > 0) {
                    // warn the user if bytes per reducer is much larger than memory per task
                    if ((double) sparkMemoryAndCores.getLeft() / bytesPerReducer < 0.5) {
                        LOG.warn("Average load of a reducer is much larger than its available memory. " + "Consider decreasing hive.exec.reducers.bytes.per.reducer");
                    }
                    // If there are more cores, use the number of cores
                    numReducers = Math.max(numReducers, sparkMemoryAndCores.getRight());
                }
                numReducers = Math.min(numReducers, maxReducers);
                LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers + " (calculated)");
                desc.setNumReducers(numReducers);
            } else {
                // Use the maximum parallelism from all parent reduce sinks
                int numberOfReducers = 0;
                for (ReduceSinkOperator parent : parentSinks) {
                    numberOfReducers = Math.max(numberOfReducers, parent.getConf().getNumReducers());
                }
                desc.setNumReducers(numberOfReducers);
                LOG.debug("Set parallelism for sink " + sink + " to " + numberOfReducers + " based on its parents");
            }
            final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> keyCols = ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getKeyCols());
            final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> partCols = ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getPartitionCols());
            if (keyCols != null && keyCols.equals(partCols)) {
                desc.setReducerTraits(EnumSet.of(UNIFORM));
            }
        }
    } else {
        LOG.info("Number of reducers for sink " + sink + " was already determined to be: " + desc.getNumReducers());
    }
    return false;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) OptimizeSparkProcContext(org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
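
The stats-driven branch above reduces to simple arithmetic: sum the data sizes reported by the sibling operators (or their upstream table scans), divide by half of hive.exec.reducers.bytes.per.reducer, raise the result to the executor core count if that is larger, and cap it at hive.exec.reducers.max. The sketch below shows only that calculation; estimate() is a simplified stand-in for Utilities.estimateReducers (ignoring its powers-of-two option, which is disabled in this call), and all input numbers are made up for illustration.

public class ReducerEstimateSketch {

    // Approximate Utilities.estimateReducers: ceil(totalBytes / bytesPerReducer),
    // at least 1, at most maxReducers.
    static int estimate(long totalBytes, long bytesPerReducer, int maxReducers) {
        int reducers = (int) ((totalBytes + bytesPerReducer - 1) / bytesPerReducer);
        return Math.max(1, Math.min(reducers, maxReducers));
    }

    public static void main(String[] args) {
        long statsBytes = 10L * 1024 * 1024 * 1024;    // 10 GB gathered from operator/TS stats
        long bytesPerReducer = 256L * 1024 * 1024 / 2; // example value, halved as in the code above
        int maxReducers = 1009;                        // hive.exec.reducers.max
        int executorCores = 48;                        // as reported by getSparkMemoryAndCores()

        int numReducers = estimate(statsBytes, bytesPerReducer, maxReducers);
        // If the cluster has more cores than the estimate, use the core count ...
        numReducers = Math.max(numReducers, executorCores);
        // ... but never exceed the configured maximum.
        numReducers = Math.min(numReducers, maxReducers);

        System.out.println(numReducers); // prints 80 for these made-up numbers
    }
}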

Example 33 with FileSinkOperator

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in the project hive by apache.

From the class TestGenMapRedUtilsCreateConditionalTask, method testConditionalMoveOnHdfsIsNotOptimized.

@Test
public void testConditionalMoveOnHdfsIsNotOptimized() throws SemanticException {
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");
    Path sinkDirName = new Path("hdfs://bucket/scratch/-ext-10002");
    FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName);
    Path finalDirName = new Path("hdfs://bucket/scratch/-ext-10000");
    Path tableLocation = new Path("hdfs://bucket/warehouse/table");
    Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation);
    List<Task<MoveWork>> moveTaskList = Collections.singletonList(moveTask);
    GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null, moveTaskList, hiveConf, dummyMRTask, new LineageState());
    ConditionalTask conditionalTask = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
    Task<?> moveOnlyTask = conditionalTask.getListTasks().get(0);
    Task<?> mergeOnlyTask = conditionalTask.getListTasks().get(1);
    Task<?> mergeAndMoveTask = conditionalTask.getListTasks().get(2);
    // Verify moveOnlyTask is NOT optimized
    assertEquals(1, moveOnlyTask.getChildTasks().size());
    verifyMoveTask(moveOnlyTask, sinkDirName, finalDirName);
    verifyMoveTask(moveOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
    // Verify mergeOnlyTask is NOT optimized
    assertEquals(1, mergeOnlyTask.getChildTasks().size());
    verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
    // Verify mergeAndMoveTask is NOT optimized
    assertEquals(1, mergeAndMoveTask.getChildTasks().size());
    assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size());
    verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName);
    verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0), finalDirName, tableLocation);
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) LineageState(org.apache.hadoop.hive.ql.session.LineageState) Test(org.junit.Test)
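
The verifyMoveTask helper belongs to the test class but is not shown in this excerpt. A plausible reconstruction, assuming MoveWork exposes the file move through getLoadFileWork() with getSourcePath() and getTargetDir() accessors (treat those names as assumptions, not confirmed API), would be:

// Hypothetical sketch of verifyMoveTask: cast the task to a MoveTask and assert
// that its MoveWork copies data from the expected source to the expected target.
private void verifyMoveTask(Task<?> task, Path source, Path target) {
    assertTrue(task instanceof MoveTask);
    MoveWork work = ((MoveTask) task).getWork();
    // Accessor names below follow the MoveWork/LoadFileDesc pattern and are assumptions.
    assertEquals(source, work.getLoadFileWork().getSourcePath());
    assertEquals(target, work.getLoadFileWork().getTargetDir());
}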

Aggregations

FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 33 usages
ArrayList (java.util.ArrayList): 14 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 13 usages
Path (org.apache.hadoop.fs.Path): 12 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 12 usages
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 10 usages
LinkedHashMap (java.util.LinkedHashMap): 8 usages
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 8 usages
Task (org.apache.hadoop.hive.ql.exec.Task): 8 usages
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 7 usages
FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc): 7 usages
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 6 usages
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 6 usages
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 6 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 6 usages
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 5 usages
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 5 usages
Node (org.apache.hadoop.hive.ql.lib.Node): 5 usages
ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext): 5 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 5 usages