Example 36 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From class TestGenTezWork, method setUp:

/**
 * @throws java.lang.Exception
 */
@SuppressWarnings("unchecked")
@Before
public void setUp() throws Exception {
    // Init conf
    final HiveConf conf = new HiveConf(SemanticAnalyzer.class);
    SessionState.start(conf);
    // Init parse context
    final ParseContext pctx = new ParseContext();
    pctx.setContext(new Context(conf));
    ctx = new GenTezProcContext(conf, pctx, Collections.EMPTY_LIST, new ArrayList<Task<?>>(), Collections.EMPTY_SET, Collections.EMPTY_SET);
    // Stub out setupMapWork so the test does not need real partition metadata.
    proc = new GenTezWork(new GenTezUtils() {

        @Override
        protected void setupMapWork(MapWork mapWork, GenTezProcContext context, PrunedPartitionList partitions, TableScanOperator root, String alias) throws SemanticException {
            LinkedHashMap<String, Operator<? extends OperatorDesc>> map = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
            map.put("foo", root);
            mapWork.setAliasToWork(map);
        }
    });
    CompilationOpContext cCtx = new CompilationOpContext();
    fs = new FileSinkOperator(cCtx);
    fs.setConf(new FileSinkDesc());
    rs = new ReduceSinkOperator(cCtx);
    rs.setConf(new ReduceSinkDesc());
    TableDesc tableDesc = new TableDesc();
    tableDesc.setProperties(new Properties());
    rs.getConf().setKeySerializeInfo(tableDesc);
    ts = new TableScanOperator(cCtx);
    ts.setConf(new TableScanDesc(null));
    // Wire up a minimal TS -> RS -> FS operator pipeline.
    ts.getChildOperators().add(rs);
    rs.getParentOperators().add(ts);
    rs.getChildOperators().add(fs);
    fs.getParentOperators().add(rs);
    ctx.preceedingWork = null;
    ctx.currentRootOperator = ts;
}
Also used: Context (org.apache.hadoop.hive.ql.Context), CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext), Operator (org.apache.hadoop.hive.ql.exec.Operator), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator), FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc), FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc), ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc), TableScanDesc (org.apache.hadoop.hive.ql.plan.TableScanDesc), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), HiveConf (org.apache.hadoop.hive.conf.HiveConf), ArrayList (java.util.ArrayList), LinkedHashMap (java.util.LinkedHashMap), Properties (java.util.Properties), Before (org.junit.Before)
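
The fixture leaves ctx.currentRootOperator pointing at the table scan and ctx.preceedingWork unset, so a test can drive GenTezWork directly through the standard NodeProcessor entry point. A minimal sketch of such a test, modeled on how TestGenTezWork exercises the processor; the assertions are illustrative and assume JUnit's Assert methods are statically imported:

@Test
public void testCreateMap() throws SemanticException {
    // Walking the reduce sink should create a root task whose MapWork
    // wraps the current root operator (the table scan).
    proc.process(rs, null, ctx, (Object[]) null);
    assertNotNull(ctx.currentTask);
    assertTrue(ctx.rootTasks.contains(ctx.currentTask));
}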

Example 37 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From class TestSharedWorkOptimizer, method getTsOp:

private TableScanOperator getTsOp() {
    Table tblMetadata = new Table("db", "table");
    TableScanDesc desc = new TableScanDesc("alias_" + cCtx.nextOperatorId(), tblMetadata);
    Operator<TableScanDesc> ts = OperatorFactory.get(cCtx, desc);
    return (TableScanOperator) ts;
}
Also used: TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), Table (org.apache.hadoop.hive.ql.metadata.Table), TableScanDesc (org.apache.hadoop.hive.ql.plan.TableScanDesc)
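
Each call mints a fresh operator with a unique auto-generated alias, which is what lets the tests below create distinguishable table scans over the same table metadata. A hypothetical usage sketch (the assertions are illustrative, not taken from the real test):

TableScanOperator ts1 = getTsOp();
TableScanOperator ts2 = getTsOp();
// Distinct operator instances with distinct aliases such as "alias_1", "alias_2".
assertNotSame(ts1, ts2);
assertNotEquals(ts1.getConf().getAlias(), ts2.getConf().getAlias());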

Example 38 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From class TestSharedWorkOptimizer, method testTSCmpOrdersById:

@Test
public void testTSCmpOrdersById() {
    TableScanOperator ts1 = getTsOp();
    TableScanOperator ts2 = getTsOp();
    ArrayList<TableScanOperator> li1 = Lists.newArrayList(ts1, ts2);
    ArrayList<TableScanOperator> li2 = Lists.newArrayList(ts2, ts1);
    li1.sort(new TSComparator());
    li2.sort(new TSComparator());
    assertTrue(li1.get(0) == li2.get(0));
}
Also used: TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), TSComparator (org.apache.hadoop.hive.ql.optimizer.SharedWorkOptimizer.TSComparator), Test (org.junit.Test)
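
The test checks that TSComparator imposes a total order: sorting any permutation of the same operators must put the same element first. A self-contained sketch of that property using a plain id-based comparator (all names here are hypothetical; this is not the actual TSComparator implementation):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class TotalOrderDemo {

    // Hypothetical id-bearing item; the real comparator orders TableScanOperators.
    static class Item {
        final int id;
        Item(int id) { this.id = id; }
    }

    public static void main(String[] args) {
        Item a = new Item(1);
        Item b = new Item(2);
        List<Item> li1 = new ArrayList<>(Arrays.asList(b, a));
        List<Item> li2 = new ArrayList<>(Arrays.asList(a, b));
        Comparator<Item> byId = Comparator.comparingInt(item -> item.id);
        li1.sort(byId);
        li2.sort(byId);
        // A total order makes the sort result independent of the input permutation.
        System.out.println(li1.get(0) == li2.get(0)); // true
    }
}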

Example 39 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From class TestNullScanTaskDispatcher, method verifyNumberOfReads:

private void verifyNumberOfReads(int externalPartitionCount, int managedPartitionCount, int upToDateManagedPartitions, int expectedReadOps) throws IOException, SemanticException {
    final String MANAGED_TEST_TABLE = "managedTestTable";
    final String EXTERNAL_TEST_TABLE = "externalTestTable";
    createTable(EXTERNAL_TEST_TABLE, externalPartitionCount, 0);
    createTable(MANAGED_TEST_TABLE, managedPartitionCount, upToDateManagedPartitions);
    // operator setup
    TableScanOperator tsoManaged = createTableScanOperator(false);
    TableScanOperator tsoExternal = createTableScanOperator(true);
    aliasToWork.put(MANAGED_TEST_TABLE, tsoManaged);
    aliasToWork.put(EXTERNAL_TEST_TABLE, tsoExternal);
    PhysicalContext physicalContext = new PhysicalContext(hiveConf, parseContext, context, getAsRootTaskList(mapWork, reduceWork), null);
    new MetadataOnlyOptimizer().resolve(physicalContext);
    assertEquals(1, mapWork.getPathToPartitionInfo().size());
    StorageStatistics statistics = FileSystem.getGlobalStorageStatistics().get("mock");
    assertEquals(expectedReadOps, (long) statistics.getLong("readOps"));
}
Also used: TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), StorageStatistics (org.apache.hadoop.fs.StorageStatistics)
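
A hypothetical invocation from a test method; the partition counts and the expected readOps value below are illustrative assumptions, not taken from the real test:

@Test
public void testReadOpsForStalePartitions() throws Exception {
    // 2 external partitions plus 3 managed partitions, of which 1 is already
    // up to date; the expected read count is an assumed value for illustration.
    verifyNumberOfReads(2, 3, 1, 4);
}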

Example 40 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From class SharedWorkOptimizer, method sharedWorkExtendedOptimization:

private static void sharedWorkExtendedOptimization(ParseContext pctx, SharedWorkOptimizerCache optimizerCache) throws SemanticException {
    // Gather RS operators that 1) belong to root works, i.e., works containing TS operators,
    // and 2) share the same input operator.
    // These will be the first targets for extended shared work optimization.
    Multimap<Operator<?>, ReduceSinkOperator> parentToRsOps = ArrayListMultimap.create();
    Set<Operator<?>> visited = new HashSet<>();
    for (Entry<String, TableScanOperator> e : pctx.getTopOps().entrySet()) {
        gatherReduceSinkOpsByInput(parentToRsOps, visited, findWorkOperators(optimizerCache, e.getValue()));
    }
    Set<Operator<?>> removedOps = new HashSet<>();
    while (!parentToRsOps.isEmpty()) {
        // As above, we enforce a certain order when we do the reutilization.
        // In particular, we use size of data in RS x number of uses.
        List<Entry<Operator<?>, Long>> sortedRSGroups = rankOpsByAccumulatedSize(parentToRsOps.keySet());
        LOG.debug("Sorted operators by size: {}", sortedRSGroups);
        // Execute extended optimization
        // For each RS, check whether other RSs in the same work could be merged into this one.
        // If they are merged, RS operators in the resulting work will be considered
        // mergeable in the next loop iteration.
        Multimap<Operator<?>, ReduceSinkOperator> existingRsOps = ArrayListMultimap.create();
        for (Entry<Operator<?>, Long> rsGroupInfo : sortedRSGroups) {
            Operator<?> rsParent = rsGroupInfo.getKey();
            for (ReduceSinkOperator discardableRsOp : parentToRsOps.get(rsParent)) {
                if (removedOps.contains(discardableRsOp)) {
                    LOG.debug("Skip {} as it has already been removed", discardableRsOp);
                    continue;
                }
                Collection<ReduceSinkOperator> otherRsOps = existingRsOps.get(rsParent);
                for (ReduceSinkOperator retainableRsOp : otherRsOps) {
                    if (retainableRsOp.getChildOperators().size() == 0) {
                        // just skip this RS - it's a semijoin/bloomfilter related RS
                        continue;
                    }
                    if (removedOps.contains(retainableRsOp)) {
                        LOG.debug("Skip {} as it has already been removed", retainableRsOp);
                        continue;
                    }
                    // First we quickly check if the two RS operators can actually be merged.
                    // We already know that these two RS operators have the same parent, but
                    // we need to check whether both RS are actually equal. Further, we check
                    // whether their child is also equal. If any of these conditions are not
                    // met, we are not going to try to merge.
                    boolean mergeable = compareOperator(pctx, retainableRsOp, discardableRsOp) && compareOperator(pctx, retainableRsOp.getChildOperators().get(0), discardableRsOp.getChildOperators().get(0));
                    if (!mergeable) {
                        // Skip
                        LOG.debug("{} and {} cannot be merged", retainableRsOp, discardableRsOp);
                        continue;
                    }
                    LOG.debug("Checking additional conditions for merging subtree starting at {}" + " into subtree starting at {}", discardableRsOp, retainableRsOp);
                    // Secondly, we extract information about the part of the tree that can be
                    // merged, as well as some structural information (memory consumption) that
                    // is used to determine whether the merge can happen.
                    Operator<?> retainableRsOpChild = retainableRsOp.getChildOperators().get(0);
                    Operator<?> discardableRsOpChild = discardableRsOp.getChildOperators().get(0);
                    SharedResult sr = extractSharedOptimizationInfo(pctx, optimizerCache, retainableRsOp, discardableRsOp, retainableRsOpChild, discardableRsOpChild);
                    // If there is nothing to retain, or the preconditions for a valid merge
                    // are not met, skip this pair.
                    if (sr.retainableOps.isEmpty() || !validPreConditions(pctx, optimizerCache, sr)) {
                        // Skip
                        LOG.debug("{} and {} do not meet preconditions", retainableRsOp, discardableRsOp);
                        continue;
                    }
                    deduplicateReduceTraits(retainableRsOp.getConf(), discardableRsOp.getConf());
                    // We can merge
                    Operator<?> lastRetainableOp = sr.retainableOps.get(sr.retainableOps.size() - 1);
                    Operator<?> lastDiscardableOp = sr.discardableOps.get(sr.discardableOps.size() - 1);
                    if (lastDiscardableOp.getNumChild() != 0) {
                        List<Operator<? extends OperatorDesc>> allChildren = Lists.newArrayList(lastDiscardableOp.getChildOperators());
                        for (Operator<? extends OperatorDesc> op : allChildren) {
                            lastDiscardableOp.getChildOperators().remove(op);
                            op.replaceParent(lastDiscardableOp, lastRetainableOp);
                            lastRetainableOp.getChildOperators().add(op);
                        }
                    }
                    LOG.debug("Merging subtree starting at {} into subtree starting at {}", discardableRsOp, retainableRsOp);
                    // First we remove the input operators of the expression we are going to eliminate
                    for (Operator<?> op : sr.discardableInputOps) {
                        OperatorUtils.removeOperator(op);
                        optimizerCache.removeOp(op);
                        removedOps.add(op);
                        // Remove DPP predicates
                        if (op instanceof ReduceSinkOperator) {
                            SemiJoinBranchInfo sjbi = pctx.getRsToSemiJoinBranchInfo().get(op);
                            if (sjbi != null && !sr.discardableOps.contains(sjbi.getTsOp()) && !sr.discardableInputOps.contains(sjbi.getTsOp())) {
                                GenTezUtils.removeSemiJoinOperator(pctx, (ReduceSinkOperator) op, sjbi.getTsOp());
                                optimizerCache.tableScanToDPPSource.remove(sjbi.getTsOp(), op);
                            }
                        } else if (op instanceof AppMasterEventOperator) {
                            DynamicPruningEventDesc dped = (DynamicPruningEventDesc) op.getConf();
                            if (!sr.discardableOps.contains(dped.getTableScan()) && !sr.discardableInputOps.contains(dped.getTableScan())) {
                                GenTezUtils.removeSemiJoinOperator(pctx, (AppMasterEventOperator) op, dped.getTableScan());
                                optimizerCache.tableScanToDPPSource.remove(dped.getTableScan(), op);
                            }
                        }
                        LOG.debug("Input operator removed: {}", op);
                    }
                    // We remove the discardable RS operator
                    OperatorUtils.removeOperator(discardableRsOp);
                    optimizerCache.removeOp(discardableRsOp);
                    removedOps.add(discardableRsOp);
                    LOG.debug("Operator removed: {}", discardableRsOp);
                    // Then we merge the operators of the works we are going to merge
                    optimizerCache.removeOpAndCombineWork(discardableRsOpChild, retainableRsOpChild);
                    // Finally we remove the rest of the expression from the tree
                    for (Operator<?> op : sr.discardableOps) {
                        OperatorUtils.removeOperator(op);
                        optimizerCache.removeOp(op);
                        removedOps.add(op);
                        LOG.debug("Operator removed: {}", op);
                    }
                    if (pctx.getConf().getBoolVar(ConfVars.HIVE_SHARED_WORK_DOWNSTREAM_MERGE)) {
                        if (sr.discardableOps.size() == 1) {
                            downStreamMerge(retainableRsOp, optimizerCache, pctx);
                        }
                    }
                    break;
                }
                if (removedOps.contains(discardableRsOp)) {
                    // This operator has been removed, remove it from the list of existing operators
                    existingRsOps.remove(rsParent, discardableRsOp);
                } else {
                    // This operator has not been removed, include it in the list of existing operators
                    existingRsOps.put(rsParent, discardableRsOp);
                }
            }
        }
        // We gather the operators that will be used for next iteration of extended optimization
        // (if any)
        parentToRsOps = ArrayListMultimap.create();
        visited = new HashSet<>();
        for (Entry<Operator<?>, ReduceSinkOperator> e : existingRsOps.entries()) {
            if (removedOps.contains(e.getValue()) || e.getValue().getNumChild() < 1) {
                // If the RS has already been removed or has no children (e.g., a
                // semijoin RS), we can quickly skip this one
                continue;
            }
            gatherReduceSinkOpsByInput(parentToRsOps, visited, findWorkOperators(optimizerCache, e.getValue().getChildOperators().get(0)));
        }
    }
    // Remove unused table scan operators
    pctx.getTopOps().entrySet().removeIf((Entry<String, TableScanOperator> e) -> e.getValue().getNumChild() == 0);
}
Also used: Entry (java.util.Map.Entry), HashSet (java.util.HashSet), LinkedHashSet (java.util.LinkedHashSet), Operator (org.apache.hadoop.hive.ql.exec.Operator), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator), MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator), JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator), UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator), FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator), AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator), DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator), SemiJoinBranchInfo (org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo), DynamicPruningEventDesc (org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)
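
The heart of the merge is the reparenting loop: the children of the last discardable operator are detached and re-attached under the last retainable operator. A minimal standalone sketch of that pattern, using a hypothetical Node class in place of Hive's Operator:

import java.util.ArrayList;
import java.util.List;

public class ReparentDemo {

    // Hypothetical stand-in for Hive's Operator parent/child wiring.
    static class Node {
        final String name;
        final List<Node> parents = new ArrayList<>();
        final List<Node> children = new ArrayList<>();
        Node(String name) { this.name = name; }

        void replaceParent(Node oldParent, Node newParent) {
            parents.set(parents.indexOf(oldParent), newParent);
        }
    }

    public static void main(String[] args) {
        Node retainable = new Node("retainable");
        Node discardable = new Node("discardable");
        Node child = new Node("child");
        discardable.children.add(child);
        child.parents.add(discardable);

        // Same pattern as the optimizer: detach each child from the discardable
        // operator and re-attach it under the retainable one.
        for (Node op : new ArrayList<>(discardable.children)) {
            discardable.children.remove(op);
            op.replaceParent(discardable, retainable);
            retainable.children.add(op);
        }
        System.out.println(child.parents.get(0).name); // prints "retainable"
    }
}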

Aggregations

TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 133 uses
Operator (org.apache.hadoop.hive.ql.exec.Operator): 52 uses
ArrayList (java.util.ArrayList): 47 uses
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 44 uses
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 36 uses
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 35 uses
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 32 uses
HashMap (java.util.HashMap): 30 uses
Path (org.apache.hadoop.fs.Path): 30 uses
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 29 uses
Table (org.apache.hadoop.hive.ql.metadata.Table): 26 uses
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 25 uses
AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator): 24 uses
DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator): 24 uses
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 23 uses
LinkedHashMap (java.util.LinkedHashMap): 22 uses
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 22 uses
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 22 uses
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 22 uses
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 21 uses