Examples with TableScanOperator - org.apache.hadoop.hive.ql.exec.TableScanOperator

Example 41 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SharedWorkOptimizer method areMergeableExtendedCheck.

/**
 * Extended mergeability check for two table scans based on their DPP sources.
 *
 * <p>The scans pass the check when neither has any DPP source, or when no
 * ReduceSink-based DPP source of one scan has the other scan among its
 * ascendant work operators and every DPP branch of the first scan can be
 * paired with a distinct, equal DPP branch of the second scan.
 *
 * @param pctx parse context, forwarded to the helper routines
 * @param optimizerCache cache providing the DPP sources registered per table scan
 * @param tsOp1 first table scan
 * @param tsOp2 second table scan
 * @return {@code true} if the DPP sources do not prevent merging the two scans
 * @throws SemanticException declared for the helper routines invoked here
 */
private static boolean areMergeableExtendedCheck(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, TableScanOperator tsOp1, TableScanOperator tsOp2) throws SemanticException {
    // Work on private copies of the DPP sources registered for each scan
    List<Operator<?>> firstDpps = new ArrayList<>(optimizerCache.tableScanToDPPSource.get(tsOp1));
    List<Operator<?>> secondDpps = new ArrayList<>(optimizerCache.tableScanToDPPSource.get(tsOp2));
    if (firstDpps.isEmpty() && secondDpps.isEmpty()) {
        // Nothing to compare
        return true;
    }
    // A ReduceSink DPP source of the first scan must not have the second scan among its ascendants
    for (Operator<?> source : firstDpps) {
        if (source instanceof ReduceSinkOperator
                && findAscendantWorkOperators(pctx, optimizerCache, source).contains(tsOp2)) {
            // This should not happen, we cannot merge
            return false;
        }
    }
    // ... and the symmetric condition for the second scan
    for (Operator<?> source : secondDpps) {
        if (source instanceof ReduceSinkOperator
                && findAscendantWorkOperators(pctx, optimizerCache, source).contains(tsOp1)) {
            // This should not happen, we cannot merge
            return false;
        }
    }
    if (firstDpps.size() != secondDpps.size()) {
        // The scans do not have the same number of DPP sources
        return false;
    }
    // Pair every DPP branch of the first scan with a not-yet-used equal branch of the second
    BitSet matched = new BitSet();
    for (Operator<?> candidate : firstDpps) {
        boolean found = false;
        for (int j = 0; j < secondDpps.size() && !found; j++) {
            if (matched.get(j)) {
                // Already paired with an earlier branch
                continue;
            }
            Operator<?> counterpart = secondDpps.get(j);
            if (compareAndGatherOps(pctx, candidate, counterpart) != null) {
                // The DPP operator/branch are equal
                matched.set(j);
                found = true;
            }
        }
        if (!found) {
            return false;
        }
    }
    return true;
}

Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ArrayList(java.util.ArrayList) BitSet(java.util.BitSet)

Example 42 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SharedWorkOptimizer method rankTablesByAccumulatedSize.

/**
 * Sums the data size of all top-level table scans per table name and returns
 * the tables ordered from largest to smallest accumulated size.
 *
 * <p>A scan without statistics contributes a size of zero. Sizes of scans
 * over the same table name are combined with {@code StatsUtils.safeAdd}.
 *
 * @param pctx parse context whose top operators are inspected
 * @return (table name, accumulated size) entries sorted by descending size
 */
private static List<Entry<String, Long>> rankTablesByAccumulatedSize(ParseContext pctx) {
    Map<String, Long> sizeByTable = new HashMap<>();
    for (Entry<String, TableScanOperator> topOp : pctx.getTopOps().entrySet()) {
        TableScanOperator scan = topOp.getValue();
        String name = scan.getTableName().toString();
        long scanSize;
        if (scan.getStatistics() != null) {
            scanSize = scan.getStatistics().getDataSize();
        } else {
            scanSize = 0L;
        }
        Long accumulated = sizeByTable.get(name);
        if (accumulated == null) {
            // First scan seen for this table
            sizeByTable.put(name, scanSize);
        } else {
            sizeByTable.put(name, StatsUtils.safeAdd(accumulated, scanSize));
        }
    }
    // Comparator that orders entries by size, largest first
    Comparator<Map.Entry<String, Long>> largestFirst = new Comparator<Map.Entry<String, Long>>() {

        @Override
        public int compare(Map.Entry<String, Long> left, Map.Entry<String, Long> right) {
            return right.getValue().compareTo(left.getValue());
        }
    };
    List<Entry<String, Long>> ranked = new ArrayList<>(sizeByTable.entrySet());
    Collections.sort(ranked, largestFirst);
    return ranked;
}

Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Entry(java.util.Map.Entry) IdentityHashMap(java.util.IdentityHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Map(java.util.Map) IdentityHashMap(java.util.IdentityHashMap) HashMap(java.util.HashMap) Comparator(java.util.Comparator)

Example 43 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SharedWorkOptimizer method compareOperator.

/**
 * Decides whether two operators are equivalent for the purposes of this optimizer.
 *
 * <p>Operators of different classes never match. ReduceSink and TableScan
 * operators are compared field by field on their descriptors; every other
 * operator type is delegated to {@code logicalEquals}.
 *
 * @param pctx parse context, used to look up the pruned partitions of table scans
 * @param op1 first operator
 * @param op2 second operator
 * @return {@code true} if the operators are considered equal
 * @throws SemanticException declared for the pruned-partition lookup
 */
private static boolean compareOperator(ParseContext pctx, Operator<?> op1, Operator<?> op2) throws SemanticException {
    String className1 = op1.getClass().getName();
    String className2 = op2.getClass().getName();
    if (!className1.equals(className2)) {
        return false;
    }
    // TODO: move this to logicalEquals
    if (op1 instanceof ReduceSinkOperator) {
        ReduceSinkDesc rsDesc1 = ((ReduceSinkOperator) op1).getConf();
        ReduceSinkDesc rsDesc2 = ((ReduceSinkOperator) op2).getConf();
        return StringUtils.equals(rsDesc1.getKeyColString(), rsDesc2.getKeyColString())
                && StringUtils.equals(rsDesc1.getValueColsString(), rsDesc2.getValueColsString())
                && StringUtils.equals(rsDesc1.getParitionColsString(), rsDesc2.getParitionColsString())
                && rsDesc1.getTag() == rsDesc2.getTag()
                && StringUtils.equals(rsDesc1.getOrder(), rsDesc2.getOrder())
                && StringUtils.equals(rsDesc1.getNullOrder(), rsDesc2.getNullOrder())
                && rsDesc1.getTopN() == rsDesc2.getTopN()
                && canDeduplicateReduceTraits(rsDesc1, rsDesc2);
    }
    // TODO: move this to logicalEquals
    if (op1 instanceof TableScanOperator) {
        TableScanOperator scan1 = (TableScanOperator) op1;
        TableScanOperator scan2 = (TableScanOperator) op2;
        TableScanDesc scanDesc1 = scan1.getConf();
        TableScanDesc scanDesc2 = scan2.getConf();
        Table table1 = scanDesc1.getTableMetadata();
        Table table2 = scanDesc2.getTableMetadata();
        return StringUtils.equals(table1.getFullyQualifiedName(), table2.getFullyQualifiedName())
                && scanDesc1.getNeededColumns().equals(scanDesc2.getNeededColumns())
                && StringUtils.equals(scanDesc1.getFilterExprString(), scanDesc2.getFilterExprString())
                && pctx.getPrunedPartitions(scan1).getPartitions().equals(pctx.getPrunedPartitions(scan2).getPartitions())
                && scanDesc1.getRowLimit() == scanDesc2.getRowLimit()
                && Objects.equals(scanDesc1.getIncludedBuckets(), scanDesc2.getIncludedBuckets())
                && Objects.equals(scanDesc1.getOpProps(), scanDesc2.getOpProps());
    }
    // Any other operator type relies on its own notion of logical equality
    return op1.logicalEquals(op2);
}

Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)

Example 44 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class TableSizeBasedBigTableSelectorForAutoSMJ method getBigTablePosition.

/**
 * Picks, among the candidate positions, the join input with the largest size.
 *
 * <p>For a non-partitioned table the size of the table itself is used; for a
 * partitioned table the sizes of the partitions returned by the partition
 * pruner ({@code getNotDeniedPartns()}) are summed.
 *
 * @param parseCtx parse context providing the configuration and pruning information
 * @param joinOp join operator whose top operators are examined
 * @param bigTableCandidates positions that are allowed to be selected
 * @return the position of the largest candidate, or {@code -1} if no candidate
 *         was examined or one of the collected top operators is {@code null}
 * @throws SemanticException if computing sizes or pruning partitions fails;
 *         the underlying {@code HiveException} is attached as the cause
 */
public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp, Set<Integer> bigTableCandidates) throws SemanticException {
    int bigTablePos = -1;
    // Start below zero so that a candidate of size 0 can still be selected
    long maxSize = -1;
    HiveConf conf = parseCtx.getConf();
    try {
        List<TableScanOperator> topOps = new ArrayList<>();
        getListTopOps(joinOp, topOps);
        int currentPos = 0;
        for (TableScanOperator topOp : topOps) {
            if (topOp == null) {
                // A missing top operator aborts the selection altogether
                return -1;
            }
            if (!bigTableCandidates.contains(currentPos)) {
                // Not an eligible position; still counts towards the position index
                currentPos++;
                continue;
            }
            Table table = topOp.getConf().getTableMetadata();
            long currentSize = 0;
            if (!table.isPartitioned()) {
                currentSize = getSize(conf, table);
            } else {
                // For partitioned tables, get the size of all the partitions
                PrunedPartitionList partsList = PartitionPruner.prune(topOp, parseCtx, null);
                for (Partition part : partsList.getNotDeniedPartns()) {
                    currentSize += getSize(conf, part);
                }
            }
            // Strict comparison: on ties the earliest candidate position wins
            if (currentSize > maxSize) {
                maxSize = currentSize;
                bigTablePos = currentPos;
            }
            currentPos++;
        }
    } catch (HiveException e) {
        // Keep the original exception as the cause so its stack trace is not lost
        throw new SemanticException(e.getMessage(), e);
    }
    return bigTablePos;
}

Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ArrayList(java.util.ArrayList) HiveConf(org.apache.hadoop.hive.conf.HiveConf) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)

Example 45 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SharedWorkOptimizer method areMergeableExcludeSemijoinsExtendedCheck.

/**
 * Checks whether two table scans can be merged once the ReduceSink-based
 * semijoin (SJ) DPP sources that target exactly one of them are set aside.
 *
 * <p>The check fails immediately if any such semijoin comes from a hint, if
 * both scans or neither scan are targeted by a ReduceSink-based semijoin, or
 * if the remaining DPP branches of the two scans cannot be paired up as equal.
 * Otherwise the semijoins are temporarily removed from the auxiliary data
 * structures and the merge preconditions are evaluated. On success the
 * semijoin branches are removed from the plan for good; on failure the
 * auxiliary data structures are restored.
 *
 * @param pctx parse context holding the semijoin branch information
 * @param optimizerCache cache providing the DPP sources registered per table scan
 * @param tsOp1 first table scan
 * @param tsOp2 second table scan
 * @return {@code true} if the scans can be merged (the semijoins have then been removed)
 * @throws SemanticException declared for the helper routines invoked here
 */
private static boolean areMergeableExcludeSemijoinsExtendedCheck(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, TableScanOperator tsOp1, TableScanOperator tsOp2) throws SemanticException {
    // We remove RS-based SJs from consideration, then we compare
    List<Operator<?>> dppsOp1 = new ArrayList<>(optimizerCache.tableScanToDPPSource.get(tsOp1));
    List<ReduceSinkOperator> rsOpsSemijoin1 = new ArrayList<>();
    List<Operator<?>> dppsOp2 = new ArrayList<>(optimizerCache.tableScanToDPPSource.get(tsOp2));
    List<ReduceSinkOperator> rsOpsSemijoin2 = new ArrayList<>();
    // Move every RS operator out of dppsOp1. The index only advances when nothing
    // was removed: List.remove(int) shifts the following element into the current
    // slot, so advancing after a removal would skip that element.
    int pos1 = 0;
    while (pos1 < dppsOp1.size()) {
        Operator<?> op = dppsOp1.get(pos1);
        if (op instanceof ReduceSinkOperator) {
            ReduceSinkOperator semijoinRSOp = (ReduceSinkOperator) op;
            if (pctx.getRsToSemiJoinBranchInfo().get(semijoinRSOp).getIsHint()) {
                // This is a hint, we should keep it, hence we bail out
                return false;
            }
            rsOpsSemijoin1.add(semijoinRSOp);
            dppsOp1.remove(pos1);
        } else {
            pos1++;
        }
    }
    // Same extraction for the second table scan
    int pos2 = 0;
    while (pos2 < dppsOp2.size()) {
        Operator<?> op = dppsOp2.get(pos2);
        if (op instanceof ReduceSinkOperator) {
            ReduceSinkOperator semijoinRSOp = (ReduceSinkOperator) op;
            if (pctx.getRsToSemiJoinBranchInfo().get(semijoinRSOp).getIsHint()) {
                // This is a hint, we should keep it, hence we bail out
                return false;
            }
            rsOpsSemijoin2.add(semijoinRSOp);
            dppsOp2.remove(pos2);
        } else {
            pos2++;
        }
    }
    boolean removedDppOp1 = !rsOpsSemijoin1.isEmpty();
    boolean removedDppOp2 = !rsOpsSemijoin2.isEmpty();
    if (removedDppOp1 && removedDppOp2) {
        // Both scans are targeted by a SJ edge; we only proceed when exactly
        // one of the two scans is targeted, so we do not merge
        return false;
    }
    if (!removedDppOp1 && !removedDppOp2) {
        // None of them are targeted by a SJ, we skip them
        return false;
    }
    if (dppsOp1.size() != dppsOp2.size()) {
        // We cannot merge, we move to the next couple
        return false;
    }
    // Check if DPP branches are equal
    boolean equalBranches = true;
    BitSet bs = new BitSet();
    for (int i = 0; i < dppsOp1.size(); i++) {
        Operator<?> dppOp1 = dppsOp1.get(i);
        for (int j = 0; j < dppsOp2.size(); j++) {
            if (!bs.get(j)) {
                // If not visited yet
                Operator<?> dppOp2 = dppsOp2.get(j);
                if (compareAndGatherOps(pctx, dppOp1, dppOp2) != null) {
                    // The DPP operator/branch are equal
                    bs.set(j);
                    break;
                }
            }
        }
        // Every completed iteration must have matched exactly one new branch
        if (bs.cardinality() < i + 1) {
            // We cannot merge, we move to the next group
            equalBranches = false;
            break;
        }
    }
    if (!equalBranches) {
        // Skip
        return false;
    }
    // We reached here, other DPP is the same, these two could potentially be merged.
    // Hence, we perform the last check. To do this, we remove the SJ operators,
    // but we remember their position in the plan. After that, we will reintroduce
    // the SJ operator. If the checks were valid, we will merge and remove the semijoin.
    // If the rest of tests to merge do not pass, we will abort the shared scan optimization
    // and we are done
    TableScanOperator targetTSOp;
    List<ReduceSinkOperator> semijoinRsOps;
    List<SemiJoinBranchInfo> sjBranches = new ArrayList<>();
    if (removedDppOp1) {
        targetTSOp = tsOp1;
        semijoinRsOps = rsOpsSemijoin1;
    } else {
        targetTSOp = tsOp2;
        semijoinRsOps = rsOpsSemijoin2;
    }
    optimizerCache.tableScanToDPPSource.get(targetTSOp).removeAll(semijoinRsOps);
    for (ReduceSinkOperator rsOp : semijoinRsOps) {
        // Remember the branch info (same order as semijoinRsOps) so it can be restored
        sjBranches.add(pctx.getRsToSemiJoinBranchInfo().remove(rsOp));
    }
    boolean validMerge = validPreConditions(pctx, optimizerCache, extractSharedOptimizationInfoForRoot(pctx, optimizerCache, tsOp1, tsOp2, true, true));
    if (validMerge) {
        // We are going to merge, hence we remove the semijoins completely
        for (ReduceSinkOperator semijoinRsOp : semijoinRsOps) {
            Operator<?> branchOp = GenTezUtils.removeBranch(semijoinRsOp);
            // Walk down the removed branch via the first child and drop each operator from the cache
            while (branchOp != null) {
                optimizerCache.removeOp(branchOp);
                branchOp = branchOp.getNumChild() > 0 ? branchOp.getChildOperators().get(0) : null;
            }
            GenTezUtils.removeSemiJoinOperator(pctx, semijoinRsOp, targetTSOp);
        }
    } else {
        // Otherwise, put the semijoins back in the auxiliary data structures
        optimizerCache.tableScanToDPPSource.get(targetTSOp).addAll(semijoinRsOps);
        for (int i = 0; i < semijoinRsOps.size(); i++) {
            pctx.getRsToSemiJoinBranchInfo().put(semijoinRsOps.get(i), sjBranches.get(i));
        }
    }
    return validMerge;
}

Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ArrayList(java.util.ArrayList) BitSet(java.util.BitSet)

Aggregations

TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)133 Operator (org.apache.hadoop.hive.ql.exec.Operator)52 ArrayList (java.util.ArrayList)47 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)44 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)36 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)35 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)32 HashMap (java.util.HashMap)30 Path (org.apache.hadoop.fs.Path)30 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)29 Table (org.apache.hadoop.hive.ql.metadata.Table)26 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)25 AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator)24 DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator)24 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)23 LinkedHashMap (java.util.LinkedHashMap)22 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)22 MapWork (org.apache.hadoop.hive.ql.plan.MapWork)22 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)22 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)21