
Example 81 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class TezCompiler method connect.

private void connect(Operator<?> o, AtomicInteger index, Stack<Operator<?>> nodes, Map<Operator<?>, Integer> indexes, Map<Operator<?>, Integer> lowLinks, Set<Set<Operator<?>>> components, ParseContext parseContext) {
    indexes.put(o, index.get());
    lowLinks.put(o, index.get());
    index.incrementAndGet();
    nodes.push(o);
    List<Operator<?>> children;
    if (o instanceof AppMasterEventOperator) {
        children = new ArrayList<Operator<?>>();
        children.addAll(o.getChildOperators());
        TableScanOperator ts = ((DynamicPruningEventDesc) o.getConf()).getTableScan();
        LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
        children.add(ts);
    } else if (o instanceof ReduceSinkOperator) {
        // semijoin case
        children = new ArrayList<Operator<?>>();
        children.addAll(o.getChildOperators());
        SemiJoinBranchInfo sjInfo = parseContext.getRsToSemiJoinBranchInfo().get(o);
        if (sjInfo != null) {
            TableScanOperator ts = sjInfo.getTsOp();
            LOG.debug("Adding special edge: " + o.getName() + " --> " + ts.toString());
            children.add(ts);
        }
    } else {
        children = o.getChildOperators();
    }
    for (Operator<?> child : children) {
        if (!indexes.containsKey(child)) {
            connect(child, index, nodes, indexes, lowLinks, components, parseContext);
            lowLinks.put(o, Math.min(lowLinks.get(o), lowLinks.get(child)));
        } else if (nodes.contains(child)) {
            lowLinks.put(o, Math.min(lowLinks.get(o), indexes.get(child)));
        }
    }
    if (lowLinks.get(o).equals(indexes.get(o))) {
        Set<Operator<?>> component = new LinkedHashSet<Operator<?>>();
        components.add(component);
        Operator<?> current;
        do {
            current = nodes.pop();
            component.add(current);
        } while (current != o);
    }
}
Also used : CommonMergeJoinOperator(org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) LinkedHashSet(java.util.LinkedHashSet) ArrayList(java.util.ArrayList) DynamicPruningEventDesc(org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc)
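
Aside: connect is the recursive step of Tarjan's strongly-connected-components algorithm, run over the operator DAG augmented with the "special" pruning edges. For readers unfamiliar with the algorithm, here is a minimal stand-alone sketch over a plain integer adjacency list (the Tarjan class and its names are illustrative, not part of Hive):

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

// Minimal Tarjan SCC; mirrors the index/lowLink/stack bookkeeping above.
class Tarjan {
    private final List<List<Integer>> graph;
    private final int[] index;
    private final int[] lowLink;
    private final boolean[] onStack;
    private final Deque<Integer> stack = new ArrayDeque<>();
    private final List<Set<Integer>> components = new ArrayList<>();
    private int counter = 0;

    Tarjan(List<List<Integer>> graph) {
        this.graph = graph;
        int n = graph.size();
        index = new int[n];
        lowLink = new int[n];
        onStack = new boolean[n];
        Arrays.fill(index, -1); // -1 marks "not yet visited"
    }

    List<Set<Integer>> run() {
        for (int v = 0; v < graph.size(); v++) {
            if (index[v] == -1) {
                connect(v);
            }
        }
        return components;
    }

    private void connect(int v) {
        index[v] = lowLink[v] = counter++;
        stack.push(v);
        onStack[v] = true;
        for (int child : graph.get(v)) {
            if (index[child] == -1) {
                connect(child);
                lowLink[v] = Math.min(lowLink[v], lowLink[child]);
            } else if (onStack[child]) {
                lowLink[v] = Math.min(lowLink[v], index[child]);
            }
        }
        if (lowLink[v] == index[v]) { // v is the root of a component
            Set<Integer> component = new LinkedHashSet<>();
            int w;
            do {
                w = stack.pop();
                onStack[w] = false;
                component.add(w);
            } while (w != v);
            components.add(component);
        }
    }
}

One difference worth noting: the Hive version tests stack membership with nodes.contains(child), which is linear in the stack size, while the sketch tracks membership with a boolean array, the usual O(1) formulation.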

Example 82 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class GenTezUtils method processDynamicSemiJoinPushDownOperator.

public static void processDynamicSemiJoinPushDownOperator(GenTezProcContext procCtx, RuntimeValuesInfo runtimeValuesInfo, ReduceSinkOperator rs) throws SemanticException {
    SemiJoinBranchInfo sjInfo = procCtx.parseContext.getRsToSemiJoinBranchInfo().get(rs);
    List<BaseWork> rsWorkList = procCtx.childToWorkMap.get(rs);
    if (sjInfo == null || rsWorkList == null) {
        // This can happen when the semijoin branch was removed by cycle
        // detection logic. Nothing to do here.
        return;
    }
    if (rsWorkList.size() != 1) {
        StringBuilder sb = new StringBuilder();
        for (BaseWork curWork : rsWorkList) {
            if (sb.length() > 0) {
                sb.append(", ");
            }
            sb.append(curWork.getName());
        }
        throw new SemanticException(rs + " belongs to multiple BaseWorks: " + sb.toString());
    }
    TableScanOperator ts = sjInfo.getTsOp();
    LOG.debug("ResduceSink " + rs + " to TableScan " + ts);
    BaseWork parentWork = rsWorkList.get(0);
    BaseWork childWork = procCtx.rootToWorkMap.get(ts);
    // Connect parent/child work with a broadcast edge.
    LOG.debug("Connecting BaseWork - " + parentWork.getName() + " to " + childWork.getName());
    TezEdgeProperty edgeProperty = new TezEdgeProperty(EdgeType.BROADCAST_EDGE);
    TezWork tezWork = procCtx.currentTask.getWork();
    tezWork.connect(parentWork, childWork, edgeProperty);
    // Set output names in ReduceSink
    rs.getConf().setOutputName(childWork.getName());
    // Set up the dynamic values in the childWork.
    RuntimeValuesInfo childRuntimeValuesInfo = new RuntimeValuesInfo();
    childRuntimeValuesInfo.setTableDesc(runtimeValuesInfo.getTableDesc());
    childRuntimeValuesInfo.setDynamicValueIDs(runtimeValuesInfo.getDynamicValueIDs());
    childRuntimeValuesInfo.setColExprs(runtimeValuesInfo.getColExprs());
    childWork.setInputSourceToRuntimeValuesInfo(parentWork.getName(), childRuntimeValuesInfo);
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator)
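
A small usage note: the StringBuilder loop that builds the multiple-BaseWorks error message is the classic manual comma-join; with Java 8 streams the same message could be produced in one expression (a sketch of the idiom, not a change present in Hive):

import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.hive.ql.plan.BaseWork;

// Joins the names of all BaseWorks that claim the ReduceSink with ", ".
static String workNames(List<BaseWork> rsWorkList) {
    return rsWorkList.stream()
            .map(BaseWork::getName)
            .collect(Collectors.joining(", "));
}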

Example 83 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class GenTezUtils method removeUnionOperators.

// removes any union operator and clones the plan
public static void removeUnionOperators(GenTezProcContext context, BaseWork work, int indexForTezUnion) throws SemanticException {
    List<Operator<?>> roots = new ArrayList<Operator<?>>();
    roots.addAll(work.getAllRootOperators());
    if (work.getDummyOps() != null) {
        roots.addAll(work.getDummyOps());
    }
    roots.addAll(context.eventOperatorSet);
    // need to clone the plan.
    List<Operator<?>> newRoots = SerializationUtilities.cloneOperatorTree(roots, indexForTezUnion);
    // we're cloning the operator plan but we're retaining the original work. That means
    // that root operators have to be replaced with the cloned ops. The replacement map
    // tells you what that mapping is.
    BiMap<Operator<?>, Operator<?>> replacementMap = HashBiMap.create();
    // there's some special handling for dummyOps required. Mapjoins won't be properly
    // initialized if their dummy parents aren't initialized. Since we cloned the plan
    // we need to replace the dummy operators in the work with the cloned ones.
    List<HashTableDummyOperator> dummyOps = new LinkedList<HashTableDummyOperator>();
    Iterator<Operator<?>> it = newRoots.iterator();
    for (Operator<?> orig : roots) {
        Set<FileSinkOperator> fsOpSet = OperatorUtils.findOperators(orig, FileSinkOperator.class);
        for (FileSinkOperator fsOp : fsOpSet) {
            context.fileSinkSet.remove(fsOp);
        }
        Operator<?> newRoot = it.next();
        replacementMap.put(orig, newRoot);
        if (newRoot instanceof HashTableDummyOperator) {
            // dummy ops need to be updated to the cloned ones.
            dummyOps.add((HashTableDummyOperator) newRoot);
            it.remove();
        } else if (newRoot instanceof AppMasterEventOperator) {
            // need to restore the original scan.
            if (newRoot.getConf() instanceof DynamicPruningEventDesc) {
                TableScanOperator ts = ((DynamicPruningEventDesc) orig.getConf()).getTableScan();
                if (ts == null) {
                    throw new AssertionError("No table scan associated with dynamic event pruning. " + orig);
                }
                ((DynamicPruningEventDesc) newRoot.getConf()).setTableScan(ts);
            }
            it.remove();
        } else {
            if (newRoot instanceof TableScanOperator) {
                if (context.tsToEventMap.containsKey(orig)) {
                    // we need to update event operators with the cloned table scan
                    for (AppMasterEventOperator event : context.tsToEventMap.get(orig)) {
                        ((DynamicPruningEventDesc) event.getConf()).setTableScan((TableScanOperator) newRoot);
                    }
                }
                // This TableScanOperator could be part of semijoin optimization.
                Map<ReduceSinkOperator, SemiJoinBranchInfo> rsToSemiJoinBranchInfo = context.parseContext.getRsToSemiJoinBranchInfo();
                for (ReduceSinkOperator rs : rsToSemiJoinBranchInfo.keySet()) {
                    SemiJoinBranchInfo sjInfo = rsToSemiJoinBranchInfo.get(rs);
                    if (sjInfo.getTsOp() == orig) {
                        SemiJoinBranchInfo newSJInfo = new SemiJoinBranchInfo((TableScanOperator) newRoot, sjInfo.getIsHint());
                        rsToSemiJoinBranchInfo.put(rs, newSJInfo);
                    }
                }
            }
            context.rootToWorkMap.remove(orig);
            context.rootToWorkMap.put(newRoot, work);
        }
    }
    // now we remove all the unions. we throw away any branch that's not reachable from
    // the current set of roots. The reason is that those branches will be handled in
    // different tasks.
    Deque<Operator<?>> operators = new LinkedList<Operator<?>>();
    operators.addAll(newRoots);
    Set<Operator<?>> seen = new HashSet<Operator<?>>();
    while (!operators.isEmpty()) {
        Operator<?> current = operators.pop();
        seen.add(current);
        if (current instanceof FileSinkOperator) {
            FileSinkOperator fileSink = (FileSinkOperator) current;
            // remember it for additional processing later
            context.fileSinkSet.add(fileSink);
            FileSinkDesc desc = fileSink.getConf();
            Path path = desc.getDirName();
            List<FileSinkDesc> linked;
            if (!context.linkedFileSinks.containsKey(path)) {
                linked = new ArrayList<FileSinkDesc>();
                context.linkedFileSinks.put(path, linked);
            }
            linked = context.linkedFileSinks.get(path);
            linked.add(desc);
            desc.setDirName(new Path(path, AbstractFileMergeOperator.UNION_SUDBIR_PREFIX + linked.size()));
            Utilities.FILE_OP_LOGGER.debug("removing union - new desc with " + desc.getDirName() + "; parent " + path);
            desc.setLinkedFileSink(true);
            desc.setLinkedFileSinkDesc(linked);
        }
        if (current instanceof AppMasterEventOperator) {
            // remember for additional processing later
            context.eventOperatorSet.add((AppMasterEventOperator) current);
            // mark the original as abandoned. Don't need it anymore.
            context.abandonedEventOperatorSet.add((AppMasterEventOperator) replacementMap.inverse().get(current));
        }
        if (current instanceof UnionOperator) {
            Operator<?> parent = null;
            int count = 0;
            for (Operator<?> op : current.getParentOperators()) {
                if (seen.contains(op)) {
                    ++count;
                    parent = op;
                }
            }
            // we should have been able to reach the union from only one side.
            assert count <= 1;
            if (parent == null) {
                // root operator is union (can happen in reducers)
                replacementMap.put(current, current.getChildOperators().get(0));
            } else {
                parent.removeChildAndAdoptItsChildren(current);
            }
        }
        if (current instanceof FileSinkOperator || current instanceof ReduceSinkOperator) {
            current.setChildOperators(null);
        } else {
            operators.addAll(current.getChildOperators());
        }
    }
    LOG.debug("Setting dummy ops for work " + work.getName() + ": " + dummyOps);
    work.setDummyOps(dummyOps);
    work.replaceRoots(replacementMap);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) AbstractFileMergeOperator(org.apache.hadoop.hive.ql.exec.AbstractFileMergeOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) HashTableDummyOperator(org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) Path(org.apache.hadoop.fs.Path) BiMap(com.google.common.collect.BiMap) HashBiMap(com.google.common.collect.HashBiMap)
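
The while loop over the Deque above is a standard worklist traversal: visit everything reachable from the cloned roots, handling each operator kind as it surfaces. Stripped of the Hive-specific cases, the pattern looks like this (generic types and names, illustrative only):

import java.util.ArrayDeque;
import java.util.Collection;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;
import java.util.function.Function;

// Generic worklist traversal over a DAG: returns every node reachable
// from the roots, visiting each at most once.
static <T> Set<T> reachable(Collection<T> roots, Function<T, Collection<T>> childrenOf) {
    Deque<T> work = new ArrayDeque<>(roots);
    Set<T> seen = new HashSet<>();
    while (!work.isEmpty()) {
        T current = work.pop();
        if (!seen.add(current)) {
            continue; // already handled via another parent
        }
        work.addAll(childrenOf.apply(current));
    }
    return seen;
}

Note that removeUnionOperators deliberately stops descending at FileSinkOperator and ReduceSinkOperator (it nulls out their children), and it uses the seen set not just for termination but also to count how many parents of a UnionOperator were actually reached.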

Example 84 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class ProcessAnalyzeTable method process.

@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    GenTezProcContext context = (GenTezProcContext) procContext;
    TableScanOperator tableScan = (TableScanOperator) nd;
    ParseContext parseContext = context.parseContext;
    Table table = tableScan.getConf().getTableMetadata();
    Class<? extends InputFormat> inputFormat = table.getInputFormatClass();
    if (parseContext.getQueryProperties().isAnalyzeCommand()) {
        assert tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0;
        String alias = null;
        for (String a : parseContext.getTopOps().keySet()) {
            if (tableScan == parseContext.getTopOps().get(a)) {
                alias = a;
            }
        }
        assert alias != null;
        TezWork tezWork = context.currentTask.getWork();
        if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
            // For ORC & Parquet, all the following statements are the same
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
            // There will not be any Tez job above this task
            StatsWork statWork = new StatsWork(table, parseContext.getConf());
            statWork.setFooterScan();
            // If partition is specified, get pruned partition list
            Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
            if (confirmedParts.size() > 0) {
                List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
                PrunedPartitionList partList = new PrunedPartitionList(table, confirmedParts, partCols, false);
                statWork.addInputPartitions(partList.getPartitions());
            }
            Task<StatsWork> snjTask = TaskFactory.get(statWork);
            snjTask.setParentTasks(null);
            context.rootTasks.remove(context.currentTask);
            context.rootTasks.add(snjTask);
            return true;
        } else {
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
            // The plan consists of a simple TezTask followed by a StatsTask.
            // The Tez task is just a simple TableScanOperator
            BasicStatsWork basicStatsWork = new BasicStatsWork(table.getTableSpec());
            basicStatsWork.setNoScanAnalyzeCommand(parseContext.getQueryProperties().isNoScanAnalyzeCommand());
            StatsWork columnStatsWork = new StatsWork(table, basicStatsWork, parseContext.getConf());
            columnStatsWork.collectStatsFromAggregator(tableScan.getConf());
            columnStatsWork.setSourceTask(context.currentTask);
            Task<StatsWork> statsTask = TaskFactory.get(columnStatsWork);
            context.currentTask.addDependentTask(statsTask);
            // The plan consists of a StatsTask only.
            if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
                statsTask.setParentTasks(null);
                context.rootTasks.remove(context.currentTask);
                context.rootTasks.add(statsTask);
            }
            // NOTE: here we should use the new partition predicate pushdown API to
            // get a list of pruned list,
            // and pass it to setTaskPlan as the last parameter
            Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
            PrunedPartitionList partitions = null;
            if (confirmedPartns.size() > 0) {
                List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
                partitions = new PrunedPartitionList(table, confirmedPartns, partCols, false);
            }
            MapWork w = utils.createMapWork(context, tableScan, tezWork, partitions);
            w.setGatheringStats(true);
            return true;
        }
    }
    return null;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) MapredParquetInputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) BasicStatsWork(org.apache.hadoop.hive.ql.plan.BasicStatsWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) TezWork(org.apache.hadoop.hive.ql.plan.TezWork)
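
The footer-scan branch is possible because ORC (and Parquet) files persist basic statistics such as row count and raw data size in their file footers, so COMPUTE STATISTICS can be answered without launching a Tez job. As a stand-alone illustration, the org.apache.orc reader API exposes these footer values directly (the file path is invented; this is not the code path Hive's stats task actually uses):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;

public class FooterStats {
    public static void main(String[] args) throws IOException {
        // Reads aggregate statistics from the ORC footer; no row scan happens.
        Reader reader = OrcFile.createReader(new Path("/warehouse/t/000000_0"),
                OrcFile.readerOptions(new Configuration()));
        System.out.println("rows     = " + reader.getNumberOfRows());
        System.out.println("raw size = " + reader.getRawDataSize());
    }
}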

Example 85 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class GenSparkUtils method processPartitionPruningSink.

/**
 * Populate partition pruning information from the pruning sink operator to the
 * target MapWork (the MapWork for the big table side). The information includes the source table
 * name, column name, and partition key expression. It also sets up the temporary path used to
 * communicate between the target MapWork and the source BaseWork.
 *
 * Here "source" refers to the small table side, while "target" refers to the big
 * table side.
 *
 * @param context the Spark context.
 * @param pruningSink the pruner sink operator being processed.
 */
public void processPartitionPruningSink(GenSparkProcContext context, SparkPartitionPruningSinkOperator pruningSink) {
    SparkPartitionPruningSinkDesc desc = pruningSink.getConf();
    final Path outputBase = getDPPOutputPath(context.parseContext.getContext());
    final String sourceId = pruningSink.getUniqueId();
    desc.setPath(new Path(outputBase, sourceId));
    for (SparkPartitionPruningSinkDesc.DPPTargetInfo targetInfo : desc.getTargetInfos()) {
        TableScanOperator ts = targetInfo.tableScan;
        MapWork targetWork = (MapWork) context.rootToWorkMap.get(ts);
        Preconditions.checkNotNull(targetWork, "No targetWork found for tablescan " + ts);
        // set up temporary path to communicate between the small/big table
        if (targetWork.getTmpPathForPartitionPruning() == null) {
            targetWork.setTmpPathForPartitionPruning(outputBase);
            LOG.info("Setting tmp path between source work and target work:\n" + outputBase);
        }
        targetInfo.work = targetWork;
        targetInfo.columnName = SparkUtilities.getWorkId(targetWork) + ":" + targetInfo.columnName;
        pruningSink.addAsSourceEvent(targetWork, targetInfo.partKey, targetInfo.columnName, targetInfo.columnType);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) SparkPartitionPruningSinkDesc(org.apache.hadoop.hive.ql.optimizer.spark.SparkPartitionPruningSinkDesc)
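
For orientation, the naming scheme the loop relies on: every pruning sink writes its output under outputBase/sourceId, and each target's column name is prefixed with the target work's ID so that one sink can feed several MapWorks without collisions. A hedged illustration with invented values (the path and ID formats here are assumptions, not taken from Hive):

import org.apache.hadoop.fs.Path;

public class DppNaming {
    public static void main(String[] args) {
        Path outputBase = new Path("/tmp/hive/_dpp");   // hypothetical DPP output root
        String sourceId = "sink_1";                     // e.g. pruningSink.getUniqueId()
        Path sinkPath = new Path(outputBase, sourceId); // /tmp/hive/_dpp/sink_1

        String workId = "Map 1";                        // e.g. SparkUtilities.getWorkId(targetWork); format illustrative
        String columnName = workId + ":" + "part_col";  // "Map 1:part_col"
        System.out.println(sinkPath + " -> " + columnName);
    }
}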

Aggregations

TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 88
Operator (org.apache.hadoop.hive.ql.exec.Operator) 35
ArrayList (java.util.ArrayList) 33
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) 28
Table (org.apache.hadoop.hive.ql.metadata.Table) 21
HashMap (java.util.HashMap) 20
Path (org.apache.hadoop.fs.Path) 20
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator) 20
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator) 19
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator) 19
LinkedHashMap (java.util.LinkedHashMap) 18
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator) 18
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator) 18
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator) 15
MapWork (org.apache.hadoop.hive.ql.plan.MapWork) 15
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc) 15
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc) 14
Map (java.util.Map) 13
AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) 12
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator) 12