Example 1 with LimitOperator

use of org.apache.hadoop.hive.ql.exec.LimitOperator in project hive by apache.

the class SemanticAnalyzer method genLimitPlan.

@SuppressWarnings("nls")
private Operator genLimitPlan(String dest, QB qb, Operator input, int offset, int limit) throws SemanticException {
    // A map-only job can be optimized: instead of converting it to a
    // map-reduce job, we can add another map-only job that applies the limit,
    // avoiding the cost of sorting in the map-reduce phase. A better approach
    // would be to write into a local file and then run a map-only job.
    // Add the limit operator to produce the value fields.
    RowResolver inputRR = opParseCtx.get(input).getRowResolver();
    LimitDesc limitDesc = new LimitDesc(offset, limit);
    globalLimitCtx.setLastReduceLimitDesc(limitDesc);
    Operator limitMap = putOpInsertMap(OperatorFactory.getAndMakeChild(limitDesc, new RowSchema(inputRR.getColumnInfos()), input), inputRR);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Created LimitOperator Plan for clause: " + dest + " row schema: " + inputRR.toString());
    }
    return limitMap;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) LimitDesc(org.apache.hadoop.hive.ql.plan.LimitDesc)
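
For orientation, the LimitDesc built above just records the query's OFFSET and LIMIT values. A minimal sketch of that mapping, assuming only the two-argument constructor and the accessors used elsewhere on this page (the query in the comment is illustrative):

import org.apache.hadoop.hive.ql.plan.LimitDesc;

public class LimitDescSketch {
    public static void main(String[] args) {
        // A query like "SELECT * FROM t LIMIT 10 OFFSET 5" reaches
        // genLimitPlan with offset = 5 and limit = 10.
        LimitDesc desc = new LimitDesc(5, 10);
        System.out.println("offset=" + desc.getOffset() + ", limit=" + desc.getLimit());
    }
}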

Example 2 with LimitOperator

use of org.apache.hadoop.hive.ql.exec.LimitOperator in project hive by apache.

the class GlobalLimitOptimizer method transform.

@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
    Context ctx = pctx.getContext();
    Map<String, TableScanOperator> topOps = pctx.getTopOps();
    GlobalLimitCtx globalLimitCtx = pctx.getGlobalLimitCtx();
    Map<String, SplitSample> nameToSplitSample = pctx.getNameToSplitSample();
    // The optimization only applies when a single table scan, with no
    // transform/UDTF and no split sampling, is used.
    if (topOps.size() == 1 && !globalLimitCtx.ifHasTransformOrUDTF() && nameToSplitSample.isEmpty()) {
        // Here we recursively check:
        // 1. whether there is exactly one LIMIT in the query
        // 2. whether there is no aggregation, group-by, distinct, sort by,
        // distribute by, or table sampling in any of the sub-queries.
        // The query only qualifies if both conditions are satisfied.
        // 
        // Example qualified queries:
        // CREATE TABLE ... AS SELECT col1, col2 FROM tbl LIMIT ..
        // INSERT OVERWRITE TABLE ... SELECT col1, hash(col2), split(col1)
        // FROM ... LIMIT...
        // SELECT * FROM (SELECT col1 AS col2 FROM (SELECT * FROM ...) t1 LIMIT ...) t2;
        // 
        TableScanOperator ts = topOps.values().iterator().next();
        Table tab = ts.getConf().getTableMetadata();
        if (tab.isNonNative()) {
            LOG.info("Not enabling limit optimization on non native table: " + tab.getTableName());
            return pctx;
        }
        // InputFormat.getSplits won't be called if there is no input path, and
        // the TS vertex will have 0 task parallelism.
        if (tab.getStorageHandler() == null) {
            LimitOperator tempGlobalLimit = checkQbpForGlobalLimit(ts);
            // the query qualifies for the optimization
            if (tempGlobalLimit != null) {
                LimitDesc tempGlobalLimitDesc = tempGlobalLimit.getConf();
                Set<FilterOperator> filterOps = OperatorUtils.findOperators(ts, FilterOperator.class);
                if (!tab.isPartitioned()) {
                    if (filterOps.size() == 0) {
                        Integer tempOffset = tempGlobalLimitDesc.getOffset();
                        globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(), (tempOffset == null) ? 0 : tempOffset);
                    }
                } else {
                    // check if the pruner only contains partition columns
                    if (onlyContainsPartnCols(tab, filterOps)) {
                        String alias = (String) topOps.keySet().toArray()[0];
                        PrunedPartitionList partsList = pctx.getPrunedPartitions(alias, ts);
                        // all partitions must be known for the filter to
                        // prune correctly
                        if (!partsList.hasUnknownPartitions()) {
                            Integer tempOffset = tempGlobalLimitDesc.getOffset();
                            globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(), (tempOffset == null) ? 0 : tempOffset);
                        }
                    }
                }
                if (globalLimitCtx.isEnable()) {
                    LOG.info("Qualify the optimize that reduces input size for 'offset' for offset " + globalLimitCtx.getGlobalOffset());
                    LOG.info("Qualify the optimize that reduces input size for 'limit' for limit " + globalLimitCtx.getGlobalLimit());
                }
            }
        }
    }
    return pctx;
}
Also used : Context(org.apache.hadoop.hive.ql.Context) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) SplitSample(org.apache.hadoop.hive.ql.parse.SplitSample) LimitDesc(org.apache.hadoop.hive.ql.plan.LimitDesc) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) GlobalLimitCtx(org.apache.hadoop.hive.ql.parse.GlobalLimitCtx)
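
Once GlobalLimitOptimizer has run, the bounds it recorded can be read back through the same GlobalLimitCtx accessors that appear above. A minimal sketch of a consumer, assuming those accessors only; the helper itself is hypothetical, not part of Hive:

import org.apache.hadoop.hive.ql.parse.GlobalLimitCtx;

public class GlobalLimitSketch {
    // Hypothetical helper: how many input rows are worth fetching once the
    // global limit optimization has (or has not) been enabled.
    static long neededRows(GlobalLimitCtx globalLimitCtx) {
        if (!globalLimitCtx.isEnable()) {
            // Optimization not applicable; the scan cannot be bounded.
            return Long.MAX_VALUE;
        }
        // To satisfy OFFSET ... LIMIT ..., skip the offset rows and keep the limit.
        return (long) globalLimitCtx.getGlobalOffset() + globalLimitCtx.getGlobalLimit();
    }
}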

Example 3 with LimitOperator

use of org.apache.hadoop.hive.ql.exec.LimitOperator in project hive by apache.

the class SetSparkReducerParallelism method needSetParallelism.

// Tests whether the ReduceSink's parallelism needs to be set automatically.
private boolean needSetParallelism(ReduceSinkOperator reduceSink, HiveConf hiveConf) {
    ReduceSinkDesc desc = reduceSink.getConf();
    if (desc.getNumReducers() <= 0) {
        return true;
    }
    if (desc.getNumReducers() == 1 && desc.hasOrderBy() && hiveConf.getBoolVar(HiveConf.ConfVars.HIVESAMPLINGFORORDERBY) && !desc.isDeduplicated()) {
        Stack<Operator<? extends OperatorDesc>> descendants = new Stack<Operator<? extends OperatorDesc>>();
        List<Operator<? extends OperatorDesc>> children = reduceSink.getChildOperators();
        if (children != null) {
            for (Operator<? extends OperatorDesc> child : children) {
                descendants.push(child);
            }
        }
        while (!descendants.isEmpty()) {
            Operator<? extends OperatorDesc> descendant = descendants.pop();
            // If any descendant is a LimitOperator, return false.
            if (descendant instanceof LimitOperator) {
                return false;
            }
            boolean reachTerminalOperator = (descendant instanceof TerminalOperator);
            if (!reachTerminalOperator) {
                List<Operator<? extends OperatorDesc>> childrenOfDescendant = descendant.getChildOperators();
                if (childrenOfDescendant != null) {
                    for (Operator<? extends OperatorDesc> childOfDescendant : childrenOfDescendant) {
                        descendants.push(childOfDescendant);
                    }
                }
            }
        }
        return true;
    }
    return false;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) TerminalOperator(org.apache.hadoop.hive.ql.exec.TerminalOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) Stack(java.util.Stack)
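
The stack-based walk in needSetParallelism is a reusable pattern. A sketch of the same traversal generalized to search for any operator class; the helper names and class are ours, not Hive's:

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

public class OperatorTreeSketch {
    // Hypothetical utility: iteratively search the descendants of root for an
    // instance of target, mirroring the loop in needSetParallelism.
    static boolean hasDescendantOfType(Operator<? extends OperatorDesc> root, Class<?> target) {
        Deque<Operator<? extends OperatorDesc>> pending = new ArrayDeque<>();
        pushChildren(pending, root);
        while (!pending.isEmpty()) {
            Operator<? extends OperatorDesc> current = pending.pop();
            if (target.isInstance(current)) {
                return true;
            }
            pushChildren(pending, current);
        }
        return false;
    }

    private static void pushChildren(Deque<Operator<? extends OperatorDesc>> pending,
            Operator<? extends OperatorDesc> op) {
        List<Operator<? extends OperatorDesc>> children = op.getChildOperators();
        if (children != null) {
            children.forEach(pending::push);
        }
    }
}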

Example 4 with LimitOperator

use of org.apache.hadoop.hive.ql.exec.LimitOperator in project hive by apache.

the class ConstantPropagateProcCtx method getPropagatedConstants.

/**
 * Get propagated constant map from parents.
 *
 * Traverse all parents of the current operator; if there is a propagated
 * constant (determined by an assignment expression like column = constant
 * value), resolve the column using the RowResolver and add it to the current
 * constant map.
 *
 * @param op
 *        operator getting the propagated constants.
 * @return map of ColumnInfo to ExprNodeDesc. The values of that map must be either
 *         ExprNodeConstantDesc or ExprNodeNullDesc.
 */
public Map<ColumnInfo, ExprNodeDesc> getPropagatedConstants(Operator<? extends Serializable> op) {
    // this map should map ColumnInfo to ExprNodeConstantDesc
    Map<ColumnInfo, ExprNodeDesc> constants = new HashMap<ColumnInfo, ExprNodeDesc>();
    if (op.getSchema() == null) {
        return constants;
    }
    RowSchema rs = op.getSchema();
    LOG.debug("Getting constants of op:" + op + " with rs:" + rs);
    if (op.getParentOperators() == null) {
        return constants;
    }
    // A previous solution was based on tableAlias and colAlias, which is
    // unsafe, especially when CBO generates derived table names. See HIVE-13602.
    // For correctness purposes, we only trust colExprMap.
    // We assume that CBO can do the constant propagation before this function
    // is called, to help improve performance.
    // UnionOperator, LimitOperator and FilterOperator are special, they should already be
    // column-position aligned.
    List<Map<Integer, ExprNodeDesc>> parentsToConstant = new ArrayList<>();
    boolean areAllParentsContainConstant = true;
    boolean noParentsContainConstant = true;
    for (Operator<?> parent : op.getParentOperators()) {
        Map<ColumnInfo, ExprNodeDesc> constMap = opToConstantExprs.get(parent);
        if (constMap == null) {
            LOG.debug("Constant of Op " + parent.getOperatorId() + " is not found");
            areAllParentsContainConstant = false;
        } else {
            noParentsContainConstant = false;
            Map<Integer, ExprNodeDesc> map = new HashMap<>();
            for (Entry<ColumnInfo, ExprNodeDesc> entry : constMap.entrySet()) {
                map.put(parent.getSchema().getPosition(entry.getKey().getInternalName()), entry.getValue());
            }
            parentsToConstant.add(map);
            LOG.debug("Constant of Op " + parent.getOperatorId() + " " + constMap);
        }
    }
    if (noParentsContainConstant) {
        return constants;
    }
    List<ColumnInfo> signature = op.getSchema().getSignature();
    if (op instanceof LimitOperator || op instanceof FilterOperator) {
        // there should be only one parent.
        if (op.getParentOperators().size() == 1) {
            Map<Integer, ExprNodeDesc> parentToConstant = parentsToConstant.get(0);
            for (int index = 0; index < signature.size(); index++) {
                if (parentToConstant.containsKey(index)) {
                    constants.put(signature.get(index), parentToConstant.get(index));
                }
            }
        }
    } else if (op instanceof UnionOperator && areAllParentsContainConstant) {
        for (int index = 0; index < signature.size(); index++) {
            ExprNodeDesc constant = null;
            for (Map<Integer, ExprNodeDesc> parentToConstant : parentsToConstant) {
                if (!parentToConstant.containsKey(index)) {
                    // if this parent does not contain a constant at this position, we
                    // continue to look at other positions.
                    constant = null;
                    break;
                } else {
                    if (constant == null) {
                        constant = parentToConstant.get(index);
                    } else {
                        // compare if they are the same constant.
                        ExprNodeDesc nextConstant = parentToConstant.get(index);
                        if (!nextConstant.isSame(constant)) {
                            // they are not the same constant. for example, union all of 1
                            // and 2.
                            constant = null;
                            break;
                        }
                    }
                }
            }
            // we have checked all the parents for the "index" position.
            if (constant != null) {
                constants.put(signature.get(index), constant);
            }
        }
    } else if (op instanceof JoinOperator) {
        JoinOperator joinOp = (JoinOperator) op;
        Iterator<Entry<Byte, List<ExprNodeDesc>>> itr = joinOp.getConf().getExprs().entrySet().iterator();
        while (itr.hasNext()) {
            Entry<Byte, List<ExprNodeDesc>> e = itr.next();
            int tag = e.getKey();
            Operator<?> parent = op.getParentOperators().get(tag);
            List<ExprNodeDesc> exprs = e.getValue();
            if (exprs == null) {
                continue;
            }
            for (ExprNodeDesc expr : exprs) {
                // we are only interested in ExprNodeColumnDesc
                if (expr instanceof ExprNodeColumnDesc) {
                    String parentColName = ((ExprNodeColumnDesc) expr).getColumn();
                    // find this parentColName in its parent's rs
                    int parentPos = parent.getSchema().getPosition(parentColName);
                    if (parentsToConstant.get(tag).containsKey(parentPos)) {
                        // reverse look up colExprMap to find the childColName
                        if (op.getColumnExprMap() != null && op.getColumnExprMap().entrySet() != null) {
                            for (Entry<String, ExprNodeDesc> entry : op.getColumnExprMap().entrySet()) {
                                if (entry.getValue().isSame(expr)) {
                                    // now propagate the constant from the parent to the child
                                    constants.put(signature.get(op.getSchema().getPosition(entry.getKey())), parentsToConstant.get(tag).get(parentPos));
                                }
                            }
                        }
                    }
                }
            }
        }
    } else {
        // there should be only one parent.
        if (op.getParentOperators().size() == 1) {
            Operator<?> parent = op.getParentOperators().get(0);
            if (op.getColumnExprMap() != null && op.getColumnExprMap().entrySet() != null) {
                for (Entry<String, ExprNodeDesc> entry : op.getColumnExprMap().entrySet()) {
                    if (op.getSchema().getPosition(entry.getKey()) == -1) {
                        // Not present
                        continue;
                    }
                    ExprNodeDesc expr = entry.getValue();
                    if (expr instanceof ExprNodeColumnDesc) {
                        String parentColName = ((ExprNodeColumnDesc) expr).getColumn();
                        // find this parentColName in its parent's rs
                        int parentPos = parent.getSchema().getPosition(parentColName);
                        if (parentsToConstant.get(0).containsKey(parentPos)) {
                            // this position in parent is a constant
                            // now propagate the constant from the parent to the child
                            constants.put(signature.get(op.getSchema().getPosition(entry.getKey())), parentsToConstant.get(0).get(parentPos));
                        }
                    }
                }
            }
        }
    }
    LOG.debug("Offering constants " + constants.keySet() + " to operator " + op.toString());
    return constants;
}
Also used : JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) Entry(java.util.Map.Entry) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) List(java.util.List) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) Map(java.util.Map)
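
The UnionOperator branch above keeps a constant only when every parent supplies the same constant at the same column position. The rule in isolation, as a hypothetical distillation in which plain maps stand in for the per-parent constant maps and Objects.equals stands in for ExprNodeDesc.isSame:

import java.util.List;
import java.util.Map;
import java.util.Objects;

public class UnionConstantSketch {
    // A position survives only if all parents agree on a non-null constant there.
    static <V> V mergedConstantAt(List<Map<Integer, V>> parentsToConstant, int index) {
        V constant = null;
        for (Map<Integer, V> parent : parentsToConstant) {
            V candidate = parent.get(index);
            if (candidate == null) {
                return null; // this parent has no constant at this position
            }
            if (constant != null && !Objects.equals(candidate, constant)) {
                return null; // conflicting constants, e.g. UNION ALL of 1 and 2
            }
            constant = candidate;
        }
        return constant;
    }
}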

Example 5 with LimitOperator

use of org.apache.hadoop.hive.ql.exec.LimitOperator in project hive by apache.

the class GlobalLimitOptimizer method checkQbpForGlobalLimit.

/**
 * Check the LIMIT clauses in all sub-queries.
 *
 * @return the LimitOperator if there is one and only one LIMIT across all
 *         sub-queries; null otherwise (when there is no LIMIT, or more than one)
 */
private static LimitOperator checkQbpForGlobalLimit(TableScanOperator ts) {
    Set<Class<? extends Operator<?>>> searchedClasses = new ImmutableSet.Builder<Class<? extends Operator<?>>>().add(ReduceSinkOperator.class).add(GroupByOperator.class).add(FilterOperator.class).add(LimitOperator.class).build();
    Multimap<Class<? extends Operator<?>>, Operator<?>> ops = OperatorUtils.classifyOperators(ts, searchedClasses);
    // - There cannot exist any ordering or partitioning in a ReduceSink.
    for (Operator<?> op : ops.get(ReduceSinkOperator.class)) {
        ReduceSinkDesc reduceSinkConf = ((ReduceSinkOperator) op).getConf();
        if (reduceSinkConf.isOrdering() || reduceSinkConf.isPartitioning()) {
            return null;
        }
    }
    // - There cannot exist any (distinct) aggregate.
    for (Operator<?> op : ops.get(GroupByOperator.class)) {
        GroupByDesc groupByConf = ((GroupByOperator) op).getConf();
        if (groupByConf.isAggregate() || groupByConf.isDistinct()) {
            return null;
        }
    }
    // - There cannot exist any sampling predicate.
    for (Operator<?> op : ops.get(FilterOperator.class)) {
        FilterDesc filterConf = ((FilterOperator) op).getConf();
        if (filterConf.getIsSamplingPred()) {
            return null;
        }
    }
    // If there is one and only one limit starting at ts, return that limit
    // operator; otherwise (no limit, or more than one), return null.
    Collection<Operator<?>> limitOps = ops.get(LimitOperator.class);
    if (limitOps.size() == 1) {
        return (LimitOperator) limitOps.iterator().next();
    }
    return null;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) FilterDesc(org.apache.hadoop.hive.ql.plan.FilterDesc) ImmutableSet(com.google.common.collect.ImmutableSet) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) GroupByDesc(org.apache.hadoop.hive.ql.plan.GroupByDesc)
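
The Multimap returned by OperatorUtils.classifyOperators groups every operator reachable from the scan by class, which is what makes the "exactly one LIMIT" test above a single size() check. A minimal sketch of inspecting that result, assuming ops was built exactly as in checkQbpForGlobalLimit (the reporting method itself is hypothetical):

import com.google.common.collect.Multimap;
import org.apache.hadoop.hive.ql.exec.LimitOperator;
import org.apache.hadoop.hive.ql.exec.Operator;

public class ClassifySketch {
    static void reportLimits(Multimap<Class<? extends Operator<?>>, Operator<?>> ops) {
        // Guava's Multimap.get returns an empty collection, never null,
        // when the key is absent, so no null check is needed.
        int limitCount = ops.get(LimitOperator.class).size();
        System.out.println(limitCount == 1
            ? "exactly one LIMIT: the optimization may apply"
            : "zero or multiple LIMITs: the optimization is disabled");
    }
}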

Aggregations

LimitOperator (org.apache.hadoop.hive.ql.exec.LimitOperator) 7
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator) 6
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 6
Operator (org.apache.hadoop.hive.ql.exec.Operator) 5
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) 5
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator) 4
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator) 4
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator) 4
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator) 4
AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) 3
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema) 3
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) 3
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator) 3
LimitDesc (org.apache.hadoop.hive.ql.plan.LimitDesc) 3
ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) 2
ImmutableSet (com.google.common.collect.ImmutableSet) 1
ArrayList (java.util.ArrayList) 1
HashMap (java.util.HashMap) 1
LinkedHashMap (java.util.LinkedHashMap) 1
List (java.util.List) 1