Search in sources :

Example 41 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SharedWorkOptimizer method areMergeableExtendedCheck.

/**
 * Extended mergeability check for two table scans based on their DPP
 * (dynamic partition pruning) sources registered in the optimizer cache.
 *
 * <p>The scans pass the check when neither has any DPP source, or when
 * all of the following hold: no ReduceSink DPP source of one scan has the
 * other scan among the operators returned by
 * {@code findAscendantWorkOperators}; both scans have the same number of
 * DPP sources; and every DPP source of the first scan can be paired with a
 * distinct DPP source of the second scan for which
 * {@code compareAndGatherOps} returns a non-null result.
 *
 * @param pctx parse context, forwarded to the helper methods
 * @param optimizerCache cache holding the table scan to DPP source mapping
 * @param tsOp1 first table scan
 * @param tsOp2 second table scan
 * @return {@code true} if the DPP sources do not prevent merging
 * @throws SemanticException declared for the helper calls
 */
private static boolean areMergeableExtendedCheck(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, TableScanOperator tsOp1, TableScanOperator tsOp2) throws SemanticException {
    // Work on private copies of the DPP sources of each scan.
    List<Operator<?>> firstDpps = new ArrayList<>(optimizerCache.tableScanToDPPSource.get(tsOp1));
    List<Operator<?>> secondDpps = new ArrayList<>(optimizerCache.tableScanToDPPSource.get(tsOp2));
    if (firstDpps.isEmpty() && secondDpps.isEmpty()) {
        // No DPP on either side: nothing further to verify.
        return true;
    }
    // A ReduceSink DPP source of the first scan must not have the second scan as ascendant.
    for (Operator<?> dppSource : firstDpps) {
        if (dppSource instanceof ReduceSinkOperator
                && findAscendantWorkOperators(pctx, optimizerCache, dppSource).contains(tsOp2)) {
            // This should not happen, we cannot merge
            return false;
        }
    }
    // Same verification in the opposite direction.
    for (Operator<?> dppSource : secondDpps) {
        if (dppSource instanceof ReduceSinkOperator
                && findAscendantWorkOperators(pctx, optimizerCache, dppSource).contains(tsOp1)) {
            // This should not happen, we cannot merge
            return false;
        }
    }
    if (firstDpps.size() != secondDpps.size()) {
        // The two scans are pruned by a different number of DPP sources
        return false;
    }
    // Pair each DPP branch of the first scan with a not-yet-used equal branch of the second.
    BitSet alreadyMatched = new BitSet();
    for (Operator<?> candidate : firstDpps) {
        boolean found = false;
        for (int j = 0; j < secondDpps.size() && !found; j++) {
            if (alreadyMatched.get(j)) {
                // Already paired with an earlier branch
                continue;
            }
            if (compareAndGatherOps(pctx, candidate, secondDpps.get(j)) != null) {
                // The DPP operator/branch are equal
                alreadyMatched.set(j);
                found = true;
            }
        }
        if (!found) {
            return false;
        }
    }
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ArrayList(java.util.ArrayList) BitSet(java.util.BitSet)

Example 42 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SharedWorkOptimizer method rankTablesByAccumulatedSize.

/**
 * Sums, per table name, the data size reported by the statistics of every
 * top-level table scan in the parse context, and returns the tables ordered
 * from largest to smallest accumulated size.
 *
 * <p>A scan without statistics contributes a size of 0. Sizes are combined
 * with {@code StatsUtils.safeAdd}.
 *
 * @param pctx parse context providing the top operators
 * @return entries of (table name, accumulated size) in descending size order
 */
private static List<Entry<String, Long>> rankTablesByAccumulatedSize(ParseContext pctx) {
    Map<String, Long> accumulated = new HashMap<>();
    for (TableScanOperator scan : pctx.getTopOps().values()) {
        String name = scan.getTableName().toString();
        long size = 0L;
        if (scan.getStatistics() != null) {
            size = scan.getStatistics().getDataSize();
        }
        Long previous = accumulated.get(name);
        if (previous == null) {
            // First scan seen for this table
            accumulated.put(name, size);
        } else {
            accumulated.put(name, StatsUtils.safeAdd(previous, size));
        }
    }
    List<Entry<String, Long>> ranked = new ArrayList<>(accumulated.entrySet());
    // Comparing right-to-left yields descending order by accumulated size.
    Collections.sort(ranked, new Comparator<Map.Entry<String, Long>>() {

        @Override
        public int compare(Map.Entry<String, Long> left, Map.Entry<String, Long> right) {
            return right.getValue().compareTo(left.getValue());
        }
    });
    return ranked;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Entry(java.util.Map.Entry) IdentityHashMap(java.util.IdentityHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Map(java.util.Map) IdentityHashMap(java.util.IdentityHashMap) HashMap(java.util.HashMap) Comparator(java.util.Comparator)

Example 43 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SharedWorkOptimizer method compareOperator.

/**
 * Decides whether two operators are considered equal.
 *
 * <p>Operators of different runtime classes are never equal. ReduceSink and
 * TableScan operators are compared field by field on their descriptors (see
 * the inline comments); every other operator type is delegated to
 * {@code logicalEquals}.
 *
 * @param pctx parse context, used to look up the pruned partitions of table scans
 * @param op1 first operator
 * @param op2 second operator
 * @return {@code true} if both operators are considered equal
 * @throws SemanticException declared for the pruned-partition lookup
 */
private static boolean compareOperator(ParseContext pctx, Operator<?> op1, Operator<?> op2) throws SemanticException {
    String className1 = op1.getClass().getName();
    String className2 = op2.getClass().getName();
    if (!className1.equals(className2)) {
        return false;
    }
    // TODO: move this to logicalEquals
    if (op1 instanceof ReduceSinkOperator) {
        ReduceSinkDesc rsDesc1 = ((ReduceSinkOperator) op1).getConf();
        ReduceSinkDesc rsDesc2 = ((ReduceSinkOperator) op2).getConf();
        // Keys, values, partition columns, tag, ordering, top-N and reduce traits must all agree.
        return StringUtils.equals(rsDesc1.getKeyColString(), rsDesc2.getKeyColString())
            && StringUtils.equals(rsDesc1.getValueColsString(), rsDesc2.getValueColsString())
            && StringUtils.equals(rsDesc1.getParitionColsString(), rsDesc2.getParitionColsString())
            && rsDesc1.getTag() == rsDesc2.getTag()
            && StringUtils.equals(rsDesc1.getOrder(), rsDesc2.getOrder())
            && StringUtils.equals(rsDesc1.getNullOrder(), rsDesc2.getNullOrder())
            && rsDesc1.getTopN() == rsDesc2.getTopN()
            && canDeduplicateReduceTraits(rsDesc1, rsDesc2);
    }
    // TODO: move this to logicalEquals
    if (op1 instanceof TableScanOperator) {
        TableScanOperator scan1 = (TableScanOperator) op1;
        TableScanOperator scan2 = (TableScanOperator) op2;
        TableScanDesc scanDesc1 = scan1.getConf();
        TableScanDesc scanDesc2 = scan2.getConf();
        Table table1 = scanDesc1.getTableMetadata();
        Table table2 = scanDesc2.getTableMetadata();
        // Same table, columns, filter, pruned partitions, row limit, buckets and operator properties.
        return StringUtils.equals(table1.getFullyQualifiedName(), table2.getFullyQualifiedName())
            && scanDesc1.getNeededColumns().equals(scanDesc2.getNeededColumns())
            && StringUtils.equals(scanDesc1.getFilterExprString(), scanDesc2.getFilterExprString())
            && pctx.getPrunedPartitions(scan1).getPartitions().equals(pctx.getPrunedPartitions(scan2).getPartitions())
            && scanDesc1.getRowLimit() == scanDesc2.getRowLimit()
            && Objects.equals(scanDesc1.getIncludedBuckets(), scanDesc2.getIncludedBuckets())
            && Objects.equals(scanDesc1.getOpProps(), scanDesc2.getOpProps());
    }
    return op1.logicalEquals(op2);
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)

Example 44 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class TableSizeBasedBigTableSelectorForAutoSMJ method getBigTablePosition.

/**
 * Selects the position of the largest input of a join among the allowed
 * candidate positions.
 *
 * <p>The table scans feeding the join are collected with
 * {@code getListTopOps}; positions are counted in the order of that list.
 * For a non-partitioned table the size is {@code getSize(conf, table)}; for
 * a partitioned table it is the sum of {@code getSize(conf, part)} over the
 * not-denied partitions returned by the partition pruner. On a tie the
 * earliest position wins, since only a strictly larger size replaces the
 * current maximum.
 *
 * @param parseCtx parse context providing the configuration and pruning context
 * @param joinOp join whose inputs are inspected
 * @param bigTableCandidates positions that may be chosen as the big table
 * @return the chosen position, or -1 if any collected top operator is
 *         {@code null} or no candidate position was encountered
 * @throws SemanticException if a {@code HiveException} is raised while
 *         computing sizes; the original exception is attached as the cause
 */
public int getBigTablePosition(ParseContext parseCtx, JoinOperator joinOp, Set<Integer> bigTableCandidates) throws SemanticException {
    int bigTablePos = -1;
    long maxSize = -1;
    HiveConf conf = parseCtx.getConf();
    try {
        List<TableScanOperator> topOps = new ArrayList<TableScanOperator>();
        getListTopOps(joinOp, topOps);
        int currentPos = 0;
        for (TableScanOperator topOp : topOps) {
            if (topOp == null) {
                // A missing table scan makes the whole selection impossible.
                return -1;
            }
            if (!bigTableCandidates.contains(currentPos)) {
                // Not an allowed candidate; still counts as a position.
                currentPos++;
                continue;
            }
            Table table = topOp.getConf().getTableMetadata();
            long currentSize = 0;
            if (!table.isPartitioned()) {
                currentSize = getSize(conf, table);
            } else {
                // For partitioned tables, get the size of all the partitions
                PrunedPartitionList partsList = PartitionPruner.prune(topOp, parseCtx, null);
                for (Partition part : partsList.getNotDeniedPartns()) {
                    currentSize += getSize(conf, part);
                }
            }
            if (currentSize > maxSize) {
                maxSize = currentSize;
                bigTablePos = currentPos;
            }
            currentPos++;
        }
    } catch (HiveException e) {
        // Keep the original exception as the cause so its stack trace is not lost.
        throw new SemanticException(e.getMessage(), e);
    }
    return bigTablePos;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ArrayList(java.util.ArrayList) HiveConf(org.apache.hadoop.hive.conf.HiveConf) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)

Example 45 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SharedWorkOptimizer method areMergeableExcludeSemijoinsExtendedCheck.

/**
 * Checks whether two table scans could be merged if the ReduceSink-based
 * semijoin (SJ) DPP sources targeting one of them were removed, and performs
 * that removal when the merge is valid.
 *
 * <p>Steps, as implemented below:
 * <ol>
 *   <li>ReduceSink DPP sources are taken out of the local DPP lists of both
 *       scans; if any of them is marked as a hint the method returns
 *       {@code false}.</li>
 *   <li>The method continues only when exactly one of the two scans had
 *       ReduceSink DPP sources.</li>
 *   <li>The remaining DPP sources must match pairwise according to
 *       {@code compareAndGatherOps}.</li>
 *   <li>The semijoin ReduceSinks are temporarily removed from the optimizer
 *       cache and from the parse context, and {@code validPreConditions} is
 *       evaluated. On success the semijoin branches are removed for good;
 *       otherwise the auxiliary data structures are restored.</li>
 * </ol>
 *
 * @param pctx parse context holding the semijoin branch information
 * @param optimizerCache cache holding the table scan to DPP source mapping
 * @param tsOp1 first table scan
 * @param tsOp2 second table scan
 * @return {@code true} if the scans can be merged (the semijoins have then
 *         been removed), {@code false} otherwise
 * @throws SemanticException declared for the helper calls
 */
private static boolean areMergeableExcludeSemijoinsExtendedCheck(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, TableScanOperator tsOp1, TableScanOperator tsOp2) throws SemanticException {
    // We remove RS-based SJs from consideration, then we compare
    List<Operator<?>> dppsOp1 = new ArrayList<>(optimizerCache.tableScanToDPPSource.get(tsOp1));
    boolean removedDppOp1 = false;
    List<ReduceSinkOperator> rsOpsSemijoin1 = new ArrayList<>();
    List<Operator<?>> dppsOp2 = new ArrayList<>(optimizerCache.tableScanToDPPSource.get(tsOp2));
    boolean removedDppOp2 = false;
    List<ReduceSinkOperator> rsOpsSemijoin2 = new ArrayList<>();
    // The index only advances when nothing was removed: after remove(idx) the
    // next element has shifted into slot idx and must be inspected as well.
    int idx = 0;
    while (idx < dppsOp1.size()) {
        Operator<?> op = dppsOp1.get(idx);
        if (!(op instanceof ReduceSinkOperator)) {
            idx++;
            continue;
        }
        ReduceSinkOperator semijoinRSOp = (ReduceSinkOperator) op;
        if (pctx.getRsToSemiJoinBranchInfo().get(semijoinRSOp).getIsHint()) {
            // This is a hint, we should keep it, hence we bail out
            return false;
        }
        rsOpsSemijoin1.add(semijoinRSOp);
        dppsOp1.remove(idx);
        removedDppOp1 = true;
    }
    idx = 0;
    while (idx < dppsOp2.size()) {
        Operator<?> op = dppsOp2.get(idx);
        if (!(op instanceof ReduceSinkOperator)) {
            idx++;
            continue;
        }
        ReduceSinkOperator semijoinRSOp = (ReduceSinkOperator) op;
        if (pctx.getRsToSemiJoinBranchInfo().get(semijoinRSOp).getIsHint()) {
            // This is a hint, we should keep it, hence we bail out
            return false;
        }
        rsOpsSemijoin2.add(semijoinRSOp);
        dppsOp2.remove(idx);
        removedDppOp2 = true;
    }
    if (removedDppOp1 && removedDppOp2) {
        // We do not merge: both TS operators are targeted by a SJ edge, and
        // we only proceed when exactly one of them is.
        return false;
    }
    if (!removedDppOp1 && !removedDppOp2) {
        // None of them are targeted by a SJ, we skip them
        return false;
    }
    if (dppsOp1.size() != dppsOp2.size()) {
        // We cannot merge, we move to the next couple
        return false;
    }
    // Check if DPP branches are equal
    boolean equalBranches = true;
    BitSet bs = new BitSet();
    for (int i = 0; i < dppsOp1.size(); i++) {
        Operator<?> dppOp1 = dppsOp1.get(i);
        for (int j = 0; j < dppsOp2.size(); j++) {
            if (!bs.get(j)) {
                // If not visited yet
                Operator<?> dppOp2 = dppsOp2.get(j);
                if (compareAndGatherOps(pctx, dppOp1, dppOp2) != null) {
                    // The DPP operator/branch are equal
                    bs.set(j);
                    break;
                }
            }
        }
        // Every earlier iteration set exactly one bit, so fewer than i + 1
        // bits means no partner was found for dppOp1.
        if (bs.cardinality() < i + 1) {
            // We cannot merge, we move to the next group
            equalBranches = false;
            break;
        }
    }
    if (!equalBranches) {
        // Skip
        return false;
    }
    // We reached here, other DPP is the same, these two could potentially be merged.
    // Hence, we perform the last check. To do this, we remove the SJ operators,
    // but we remember their position in the plan. After that, we will reintroduce
    // the SJ operator. If the checks were valid, we will merge and remove the semijoin.
    // If the rest of tests to merge do not pass, we will abort the shared scan optimization
    // and we are done
    TableScanOperator targetTSOp;
    List<ReduceSinkOperator> semijoinRsOps;
    List<SemiJoinBranchInfo> sjBranches = new ArrayList<>();
    if (removedDppOp1) {
        targetTSOp = tsOp1;
        semijoinRsOps = rsOpsSemijoin1;
    } else {
        targetTSOp = tsOp2;
        semijoinRsOps = rsOpsSemijoin2;
    }
    optimizerCache.tableScanToDPPSource.get(targetTSOp).removeAll(semijoinRsOps);
    for (ReduceSinkOperator rsOp : semijoinRsOps) {
        // Remember the branch info so it can be restored if the merge is rejected.
        sjBranches.add(pctx.getRsToSemiJoinBranchInfo().remove(rsOp));
    }
    boolean validMerge = validPreConditions(pctx, optimizerCache, extractSharedOptimizationInfoForRoot(pctx, optimizerCache, tsOp1, tsOp2, true, true));
    if (validMerge) {
        // We are going to merge, hence we remove the semijoins completely
        for (ReduceSinkOperator semijoinRsOp : semijoinRsOps) {
            Operator<?> branchOp = GenTezUtils.removeBranch(semijoinRsOp);
            // Walk down the first-child chain of the removed branch and drop each operator from the cache.
            while (branchOp != null) {
                optimizerCache.removeOp(branchOp);
                branchOp = branchOp.getNumChild() > 0 ? branchOp.getChildOperators().get(0) : null;
            }
            GenTezUtils.removeSemiJoinOperator(pctx, semijoinRsOp, targetTSOp);
        }
    } else {
        // Otherwise, we put the semijoins back in the auxiliary data structures
        optimizerCache.tableScanToDPPSource.get(targetTSOp).addAll(semijoinRsOps);
        for (int i = 0; i < semijoinRsOps.size(); i++) {
            pctx.getRsToSemiJoinBranchInfo().put(semijoinRsOps.get(i), sjBranches.get(i));
        }
    }
    return validMerge;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SemiJoinBranchInfo(org.apache.hadoop.hive.ql.parse.SemiJoinBranchInfo) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ArrayList(java.util.ArrayList) BitSet(java.util.BitSet)

Aggregations

TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)133 Operator (org.apache.hadoop.hive.ql.exec.Operator)52 ArrayList (java.util.ArrayList)47 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)44 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)36 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)35 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)32 HashMap (java.util.HashMap)30 Path (org.apache.hadoop.fs.Path)30 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)29 Table (org.apache.hadoop.hive.ql.metadata.Table)26 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)25 AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator)24 DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator)24 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)23 LinkedHashMap (java.util.LinkedHashMap)22 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)22 MapWork (org.apache.hadoop.hive.ql.plan.MapWork)22 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)22 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)21