
Example 21 with FileSinkOperator

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in project hive by apache.

From class GenTezUtils, method removeUnionOperators:

// removes any union operator and clones the plan
public static void removeUnionOperators(GenTezProcContext context, BaseWork work, int indexForTezUnion) throws SemanticException {
    List<Operator<?>> roots = new ArrayList<Operator<?>>();
    roots.addAll(work.getAllRootOperators());
    if (work.getDummyOps() != null) {
        roots.addAll(work.getDummyOps());
    }
    roots.addAll(context.eventOperatorSet);
    // need to clone the plan.
    List<Operator<?>> newRoots = SerializationUtilities.cloneOperatorTree(roots, indexForTezUnion);
    // we're cloning the operator plan but we're retaining the original work. That means
    // that root operators have to be replaced with the cloned ops. The replacement map
    // tells you what that mapping is.
    BiMap<Operator<?>, Operator<?>> replacementMap = HashBiMap.create();
    // there's some special handling for dummyOps required. Mapjoins won't be properly
    // initialized if their dummy parents aren't initialized. Since we cloned the plan
    // we need to replace the dummy operators in the work with the cloned ones.
    List<HashTableDummyOperator> dummyOps = new LinkedList<HashTableDummyOperator>();
    Iterator<Operator<?>> it = newRoots.iterator();
    for (Operator<?> orig : roots) {
        Set<FileSinkOperator> fsOpSet = OperatorUtils.findOperators(orig, FileSinkOperator.class);
        for (FileSinkOperator fsOp : fsOpSet) {
            context.fileSinkSet.remove(fsOp);
        }
        Operator<?> newRoot = it.next();
        replacementMap.put(orig, newRoot);
        if (newRoot instanceof HashTableDummyOperator) {
            // dummy ops need to be updated to the cloned ones.
            dummyOps.add((HashTableDummyOperator) newRoot);
            it.remove();
        } else if (newRoot instanceof AppMasterEventOperator) {
            // need to restore the original scan.
            if (newRoot.getConf() instanceof DynamicPruningEventDesc) {
                TableScanOperator ts = ((DynamicPruningEventDesc) orig.getConf()).getTableScan();
                if (ts == null) {
                    throw new AssertionError("No table scan associated with dynamic event pruning. " + orig);
                }
                ((DynamicPruningEventDesc) newRoot.getConf()).setTableScan(ts);
            }
            it.remove();
        } else {
            if (newRoot instanceof TableScanOperator) {
                if (context.tsToEventMap.containsKey(orig)) {
                    // we need to update event operators with the cloned table scan
                    for (AppMasterEventOperator event : context.tsToEventMap.get(orig)) {
                        ((DynamicPruningEventDesc) event.getConf()).setTableScan((TableScanOperator) newRoot);
                    }
                }
                // This TableScanOperator could be part of semijoin optimization.
                Map<ReduceSinkOperator, SemiJoinBranchInfo> rsToSemiJoinBranchInfo = context.parseContext.getRsToSemiJoinBranchInfo();
                for (ReduceSinkOperator rs : rsToSemiJoinBranchInfo.keySet()) {
                    SemiJoinBranchInfo sjInfo = rsToSemiJoinBranchInfo.get(rs);
                    if (sjInfo.getTsOp() == orig) {
                        SemiJoinBranchInfo newSJInfo = new SemiJoinBranchInfo((TableScanOperator) newRoot, sjInfo.getIsHint());
                        rsToSemiJoinBranchInfo.put(rs, newSJInfo);
                    }
                }
            }
            context.rootToWorkMap.remove(orig);
            context.rootToWorkMap.put(newRoot, work);
        }
    }
    // now we remove all the unions. we throw away any branch that's not reachable from
    // the current set of roots. The reason is that those branches will be handled in
    // different tasks.
    Deque<Operator<?>> operators = new LinkedList<Operator<?>>();
    operators.addAll(newRoots);
    Set<Operator<?>> seen = new HashSet<Operator<?>>();
    while (!operators.isEmpty()) {
        Operator<?> current = operators.pop();
        seen.add(current);
        if (current instanceof FileSinkOperator) {
            FileSinkOperator fileSink = (FileSinkOperator) current;
            // remember it for additional processing later
            context.fileSinkSet.add(fileSink);
            FileSinkDesc desc = fileSink.getConf();
            Path path = desc.getDirName();
            List<FileSinkDesc> linked;
            if (!context.linkedFileSinks.containsKey(path)) {
                linked = new ArrayList<FileSinkDesc>();
                context.linkedFileSinks.put(path, linked);
            }
            linked = context.linkedFileSinks.get(path);
            linked.add(desc);
            desc.setDirName(new Path(path, AbstractFileMergeOperator.UNION_SUDBIR_PREFIX + linked.size()));
            Utilities.FILE_OP_LOGGER.debug("removing union - new desc with " + desc.getDirName() + "; parent " + path);
            desc.setLinkedFileSink(true);
            desc.setLinkedFileSinkDesc(linked);
        }
        if (current instanceof AppMasterEventOperator) {
            // remember for additional processing later
            context.eventOperatorSet.add((AppMasterEventOperator) current);
            // mark the original as abandoned. Don't need it anymore.
            context.abandonedEventOperatorSet.add((AppMasterEventOperator) replacementMap.inverse().get(current));
        }
        if (current instanceof UnionOperator) {
            Operator<?> parent = null;
            int count = 0;
            for (Operator<?> op : current.getParentOperators()) {
                if (seen.contains(op)) {
                    ++count;
                    parent = op;
                }
            }
            // we should have been able to reach the union from only one side.
            assert count <= 1;
            if (parent == null) {
                // root operator is union (can happen in reducers)
                replacementMap.put(current, current.getChildOperators().get(0));
            } else {
                parent.removeChildAndAdoptItsChildren(current);
            }
        }
        if (current instanceof FileSinkOperator || current instanceof ReduceSinkOperator) {
            current.setChildOperators(null);
        } else {
            operators.addAll(current.getChildOperators());
        }
    }
    LOG.debug("Setting dummy ops for work " + work.getName() + ": " + dummyOps);
    work.setDummyOps(dummyOps);
    work.replaceRoots(replacementMap);
}
Also used: Path (org.apache.hadoop.fs.Path), Operator (org.apache.hadoop.hive.ql.exec.Operator), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator), ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator), MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator), UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator), FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator), AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator), AbstractFileMergeOperator (org.apache.hadoop.hive.ql.exec.AbstractFileMergeOperator), HashTableDummyOperator (org.apache.hadoop.hive.ql.exec.HashTableDummyOperator), BiMap (com.google.common.collect.BiMap), HashBiMap (com.google.common.collect.HashBiMap)
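
The replacement map in this method is a Guava BiMap rather than a plain HashMap so the code can look up clones from originals when rewiring roots and, via inverse(), originals from clones when marking abandoned event operators. A minimal, self-contained sketch of that two-way lookup, with a hypothetical Node class standing in for Hive's operators:

import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;

// Sketch only: Node is a hypothetical stand-in for Operator<?>.
public class ReplacementMapSketch {

    static class Node {
        final String name;
        Node(String name) { this.name = name; }
        @Override public String toString() { return name; }
    }

    public static void main(String[] args) {
        BiMap<Node, Node> replacementMap = HashBiMap.create();
        Node original = new Node("TS[0]");
        Node clone = new Node("TS[0]-clone");
        replacementMap.put(original, clone);

        // forward lookup: original -> clone (used when replacing the work's roots)
        System.out.println(replacementMap.get(original));        // TS[0]-clone
        // inverse lookup: clone -> original (used to find the abandoned
        // AppMasterEventOperator that corresponds to a cloned one)
        System.out.println(replacementMap.inverse().get(clone)); // TS[0]
    }
}

A BiMap also rejects duplicate values, which matches the invariant here: each original operator maps to exactly one clone and vice versa.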

Example 22 with FileSinkOperator

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in project hive by apache.

From class GenSparkUtils, method getEdgeProperty:

public static SparkEdgeProperty getEdgeProperty(HiveConf conf, ReduceSinkOperator reduceSink, ReduceWork reduceWork) throws SemanticException {
    boolean useSparkGroupBy = conf.getBoolVar(HiveConf.ConfVars.SPARK_USE_GROUPBY_SHUFFLE);
    SparkEdgeProperty edgeProperty = new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE);
    edgeProperty.setNumPartitions(reduceWork.getNumReduceTasks());
    String sortOrder = Strings.nullToEmpty(reduceSink.getConf().getOrder()).trim();
    if (hasGBYOperator(reduceSink)) {
        edgeProperty.setShuffleGroup();
        // SHUFFLE_SORT shouldn't be used for this purpose, see HIVE-8542
        if (!useSparkGroupBy || (!sortOrder.isEmpty() && groupByNeedParLevelOrder(reduceSink))) {
            if (!useSparkGroupBy) {
                LOG.info("hive.spark.use.groupby.shuffle is off. Use repartition shuffle instead.");
            }
            edgeProperty.setMRShuffle();
        }
    }
    if (reduceWork.getReducer() instanceof JoinOperator) {
        // reduce-side join, use MR-style shuffle
        edgeProperty.setMRShuffle();
    }
    // If it's a FileSink to bucketed files, also use MR-style shuffle to
    // get a compatible taskId for the bucket name
    FileSinkOperator fso = getChildOperator(reduceWork.getReducer(), FileSinkOperator.class);
    if (fso != null) {
        String bucketCount = fso.getConf().getTableInfo().getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT);
        if (bucketCount != null && Integer.parseInt(bucketCount) > 1) {
            edgeProperty.setMRShuffle();
        }
    }
    // test if we need partition/global order, SHUFFLE_SORT should only be used for global order
    if (edgeProperty.isShuffleNone() && !sortOrder.isEmpty()) {
        if ((reduceSink.getConf().getPartitionCols() == null || reduceSink.getConf().getPartitionCols().isEmpty() || isSame(reduceSink.getConf().getPartitionCols(), reduceSink.getConf().getKeyCols())) && reduceSink.getConf().hasOrderBy()) {
            edgeProperty.setShuffleSort();
        } else {
            edgeProperty.setMRShuffle();
        }
    }
    // simple distribute-by goes here
    if (edgeProperty.isShuffleNone()) {
        if (!useSparkGroupBy) {
            LOG.info("hive.spark.use.groupby.shuffle is off. Use repartition shuffle instead.");
            edgeProperty.setMRShuffle();
        } else {
            edgeProperty.setShuffleGroup();
        }
    }
    return edgeProperty;
}
Also used: SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator), JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator), FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator), SparkEdgeProperty (org.apache.hadoop.hive.ql.plan.SparkEdgeProperty)
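
The shuffle decision above folds several independent conditions into one choice, with MR-style shuffle as the conservative fallback. The following sketch condenses that precedence into a single pure function; the booleans and the Shuffle enum are hypothetical simplifications, not Hive's API, and the parallel-order fallback for group-by is omitted for brevity:

public class EdgeDecisionSketch {

    enum Shuffle { NONE, GROUP, SORT, MR }

    static Shuffle decide(boolean hasGroupBy, boolean useSparkGroupBy,
                          boolean reduceSideJoin, boolean bucketedFileSink,
                          boolean hasSortOrder, boolean globalOrder) {
        Shuffle shuffle = Shuffle.NONE;
        if (hasGroupBy) {
            // group-by shuffle by default; fall back to MR-style when
            // hive.spark.use.groupby.shuffle is off
            shuffle = useSparkGroupBy ? Shuffle.GROUP : Shuffle.MR;
        }
        if (reduceSideJoin || bucketedFileSink) {
            // reduce-side joins and bucketed file sinks need MR-style
            // shuffle for compatible task IDs
            shuffle = Shuffle.MR;
        }
        if (shuffle == Shuffle.NONE && hasSortOrder) {
            // SHUFFLE_SORT is reserved for global (total) order
            shuffle = globalOrder ? Shuffle.SORT : Shuffle.MR;
        }
        if (shuffle == Shuffle.NONE) {
            // plain distribute-by
            shuffle = useSparkGroupBy ? Shuffle.GROUP : Shuffle.MR;
        }
        return shuffle;
    }

    public static void main(String[] args) {
        System.out.println(decide(false, true, false, false, true, true));  // SORT
        System.out.println(decide(false, true, true, false, false, false)); // MR
        System.out.println(decide(true, true, false, false, false, false)); // GROUP
    }
}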

Example 23 with FileSinkOperator

Use of org.apache.hadoop.hive.ql.exec.FileSinkOperator in project hive by apache.

From class TestGenMapRedUtilsCreateConditionalTask, method testConditionalMoveOnHdfsIsNotOptimized:

@Test
public void testConditionalMoveOnHdfsIsNotOptimized() throws SemanticException {
    hiveConf.set(HiveConf.ConfVars.HIVE_BLOBSTORE_OPTIMIZATIONS_ENABLED.varname, "true");
    Path sinkDirName = new Path("hdfs://bucket/scratch/-ext-10002");
    FileSinkOperator fileSinkOperator = createFileSinkOperator(sinkDirName);
    Path finalDirName = new Path("hdfs://bucket/scratch/-ext-10000");
    Path tableLocation = new Path("hdfs://bucket/warehouse/table");
    Task<MoveWork> moveTask = createMoveTask(finalDirName, tableLocation);
    List<Task<MoveWork>> moveTaskList = Collections.singletonList(moveTask);
    GenMapRedUtils.createMRWorkForMergingFiles(fileSinkOperator, finalDirName, null, moveTaskList, hiveConf, dummyMRTask, new LineageState());
    ConditionalTask conditionalTask = (ConditionalTask) dummyMRTask.getChildTasks().get(0);
    Task<? extends Serializable> moveOnlyTask = conditionalTask.getListTasks().get(0);
    Task<? extends Serializable> mergeOnlyTask = conditionalTask.getListTasks().get(1);
    Task<? extends Serializable> mergeAndMoveTask = conditionalTask.getListTasks().get(2);
    // Verify moveOnlyTask is NOT optimized
    assertEquals(1, moveOnlyTask.getChildTasks().size());
    verifyMoveTask(moveOnlyTask, sinkDirName, finalDirName);
    verifyMoveTask(moveOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
    // Verify mergeOnlyTask is NOT optimized
    assertEquals(1, mergeOnlyTask.getChildTasks().size());
    verifyMoveTask(mergeOnlyTask.getChildTasks().get(0), finalDirName, tableLocation);
    // Verify mergeAndMoveTask is NOT optimized
    assertEquals(1, mergeAndMoveTask.getChildTasks().size());
    assertEquals(1, mergeAndMoveTask.getChildTasks().get(0).getChildTasks().size());
    verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0), sinkDirName, finalDirName);
    verifyMoveTask(mergeAndMoveTask.getChildTasks().get(0).getChildTasks().get(0), finalDirName, tableLocation);
}
Also used: Path (org.apache.hadoop.fs.Path), MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork), Task (org.apache.hadoop.hive.ql.exec.Task), ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask), MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask), MoveTask (org.apache.hadoop.hive.ql.exec.MoveTask), FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator), LineageState (org.apache.hadoop.hive.ql.session.LineageState), Test (org.junit.Test)
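
What the test pins down is that none of the three branches gets the blobstore move optimization when the paths live on HDFS: each branch still routes through the intermediate staging directory before the final move into the table. A minimal sketch of that two-hop chain, using a hypothetical Move record rather than Hive's MoveTask:

import java.util.List;

// Sketch only: Move is a hypothetical stand-in for a Hive move task.
public class ConditionalMoveSketch {

    record Move(String from, String to) {}

    public static void main(String[] args) {
        String sink  = "hdfs://bucket/scratch/-ext-10002"; // file sink output
        String fin   = "hdfs://bucket/scratch/-ext-10000"; // staging dir
        String table = "hdfs://bucket/warehouse/table";

        // the unoptimized chain the test expects: sink -> staging -> table
        List<Move> chain = List.of(new Move(sink, fin), new Move(fin, table));

        // were these blobstore paths (e.g. s3a://) with the optimization
        // enabled, the chain could collapse to a single move into the table
        System.out.println("hops preserved on HDFS: " + chain.size()); // 2
    }
}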

Aggregations

FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 23 usages
Path (org.apache.hadoop.fs.Path): 10 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 9 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 9 usages
ArrayList (java.util.ArrayList): 8 usages
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 7 usages
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 6 usages
Task (org.apache.hadoop.hive.ql.exec.Task): 6 usages
FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc): 6 usages
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 5 usages
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 5 usages
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 5 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 5 usages
Serializable (java.io.Serializable): 4 usages
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 4 usages
MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork): 4 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 4 usages
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 4 usages
LinkedHashMap (java.util.LinkedHashMap): 3 usages
LinkedList (java.util.LinkedList): 3 usages