Search in sources :

Example 71 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SparkUtilities method removeNestedDPP.

/**
 * For DPP sinks with a common join, we'll split the tree, and whatever is above the branching
 * operator is computed multiple times. It therefore may hurt performance to support
 * nested DPP sinks, i.e. one DPP sink depending on another.
 * The following is an example:
 *
 *             TS          TS
 *             |           |
 *            ...         FIL
 *            |           |  \
 *            RS         RS  SEL
 *              \        /    |
 *     TS          JOIN      GBY
 *     |         /     \      |
 *    RS        RS    SEL   DPP2
 *     \       /       |
 *       JOIN         GBY
 *                     |
 *                    DPP1
 *
 * where DPP1 depends on DPP2.
 *
 * To avoid such cases, we visit all the branching operators. If a branching operator has any
 * nested (non-direct) DPP branches with a common join in its sub-tree, such branches are removed.
 * In the above example, the branch of DPP1 will be removed.
 */
public static void removeNestedDPP(OptimizeSparkProcContext procContext) {
    Set<SparkPartitionPruningSinkOperator> allDPPs = new HashSet<>();
    Set<Operator<?>> seen = new HashSet<>();
    // collect all DPP sinks
    for (TableScanOperator root : procContext.getParseContext().getTopOps().values()) {
        SparkUtilities.collectOp(root, SparkPartitionPruningSinkOperator.class, allDPPs, seen);
    }
    // collect all branching operators
    Set<Operator<?>> branchingOps = new HashSet<>();
    for (SparkPartitionPruningSinkOperator dpp : allDPPs) {
        branchingOps.add(dpp.getBranchingOp());
    }
    // remember the branching ops we have visited
    Set<Operator<?>> visited = new HashSet<>();
    for (Operator<?> branchingOp : branchingOps) {
        if (!visited.contains(branchingOp)) {
            visited.add(branchingOp);
            seen.clear();
            Set<SparkPartitionPruningSinkOperator> nestedDPPs = new HashSet<>();
            for (Operator<?> branch : branchingOp.getChildOperators()) {
                if (!isDirectDPPBranch(branch)) {
                    SparkUtilities.collectOp(branch, SparkPartitionPruningSinkOperator.class, nestedDPPs, seen);
                }
            }
            for (SparkPartitionPruningSinkOperator nestedDPP : nestedDPPs) {
                visited.add(nestedDPP.getBranchingOp());
                // if a DPP sink goes with a map join, the tree won't be split, so we don't have to remove it
                if (!nestedDPP.isWithMapjoin()) {
                    OperatorUtils.removeBranch(nestedDPP);
                }
            }
        }
    }
}
Also used : SparkPartitionPruningSinkOperator(org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SparkPartitionPruningSinkOperator(org.apache.hadoop.hive.ql.parse.spark.SparkPartitionPruningSinkOperator) HashSet(java.util.HashSet)
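
The removal relies on SparkUtilities.collectOp to gather every operator of a given class in a sub-tree while skipping anything already seen. The following self-contained sketch illustrates that traversal pattern on a hypothetical operator tree; Node, collect and CollectOpSketch are illustrative stand-ins, not Hive classes. It is a depth-first walk with a shared "seen" set, so a sub-tree reachable from several parents is inspected only once.

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Hypothetical stand-in for an operator-tree node; not a Hive class.
class Node {
    final String type;
    final List<Node> children = new ArrayList<>();
    Node(String type) { this.type = type; }
    Node add(Node child) { children.add(child); return this; }
}

public class CollectOpSketch {
    // Depth-first collection of all nodes of a given type, mirroring what
    // collectOp does: the shared "seen" set prevents revisiting sub-trees.
    static void collect(Node root, String type, Set<Node> found, Set<Node> seen) {
        Deque<Node> stack = new ArrayDeque<>();
        stack.push(root);
        while (!stack.isEmpty()) {
            Node n = stack.pop();
            if (!seen.add(n)) {
                continue; // already visited via another parent
            }
            if (n.type.equals(type)) {
                found.add(n);
            }
            n.children.forEach(stack::push);
        }
    }

    public static void main(String[] args) {
        // TS -> FIL -> {RS, SEL -> GBY -> DPP}, loosely following the diagram above
        Node dpp = new Node("DPP");
        Node root = new Node("TS").add(new Node("FIL")
                .add(new Node("RS"))
                .add(new Node("SEL").add(new Node("GBY").add(dpp))));
        Set<Node> found = new HashSet<>();
        collect(root, "DPP", found, new HashSet<>());
        System.out.println(found.contains(dpp)); // true
    }
}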

Example 72 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class Driver method recordValidWriteIds.

// Write the current set of valid write ids for the acid tables operated on into the conf so
// that it can be read by the input format.
private void recordValidWriteIds(HiveTxnManager txnMgr) throws LockException {
    String txnString = conf.get(ValidTxnList.VALID_TXNS_KEY);
    if ((txnString == null) || (txnString.isEmpty())) {
        throw new IllegalStateException("calling recordValidWriteIds() without initializing ValidTxnList " + JavaUtils.txnIdToString(txnMgr.getCurrentTxnId()));
    }
    ValidTxnWriteIdList txnWriteIds = txnMgr.getValidWriteIds(getTransactionalTableList(plan), txnString);
    String writeIdStr = txnWriteIds.toString();
    conf.set(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY, writeIdStr);
    if (plan.getFetchTask() != null) {
        /**
         * This is needed for {@link HiveConf.ConfVars.HIVEFETCHTASKCONVERSION} optimization which
         * initializes JobConf in FetchOperator before recordValidTxns() but this has to be done
         * after locks are acquired to avoid race conditions in ACID.
         * This case is supported only for single source query.
         */
        Operator<?> source = plan.getFetchTask().getWork().getSource();
        if (source instanceof TableScanOperator) {
            TableScanOperator tsOp = (TableScanOperator) source;
            String fullTableName = AcidUtils.getFullTableName(tsOp.getConf().getDatabaseName(), tsOp.getConf().getTableName());
            ValidWriteIdList writeIdList = txnWriteIds.getTableValidWriteIdList(fullTableName);
            if (tsOp.getConf().isTranscationalTable() && (writeIdList == null)) {
                throw new IllegalStateException("ACID table: " + fullTableName + " is missing from the ValidWriteIdList config: " + writeIdStr);
            }
            if (writeIdList != null) {
                plan.getFetchTask().setValidWriteIdList(writeIdList.toString());
            }
        }
    }
    LOG.debug("Encoding valid txn write ids info " + writeIdStr + " txnid:" + txnMgr.getCurrentTxnId());
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) ValidWriteIdList(org.apache.hadoop.hive.common.ValidWriteIdList) ValidTxnWriteIdList(org.apache.hadoop.hive.common.ValidTxnWriteIdList)
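
On the consumer side, an input format or fetch task can read the same key back from the job configuration. Below is a minimal sketch of that read path; it assumes ValidTxnWriteIdList can be reconstructed from the string written above through a String-parsing constructor, and the helper itself (ReadWriteIdsSketch.readWriteIds) is illustrative rather than an actual Hive API.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.ValidTxnWriteIdList;
import org.apache.hadoop.hive.common.ValidWriteIdList;

public class ReadWriteIdsSketch {
    // Illustrative reader for the value stored by recordValidWriteIds();
    // fullTableName is expected in "db.table" form, as produced by AcidUtils.getFullTableName.
    static ValidWriteIdList readWriteIds(Configuration conf, String fullTableName) {
        String writeIdStr = conf.get(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY);
        if (writeIdStr == null || writeIdStr.isEmpty()) {
            return null; // recordValidWriteIds() has not populated the conf for this query
        }
        // Assumption: ValidTxnWriteIdList offers a constructor that parses the
        // toString() representation written into the conf above.
        ValidTxnWriteIdList txnWriteIds = new ValidTxnWriteIdList(writeIdStr);
        return txnWriteIds.getTableValidWriteIdList(fullTableName);
    }
}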

Example 73 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SemanticAnalyzer method genTablePlan.

@SuppressWarnings("nls")
private Operator genTablePlan(String alias, QB qb) throws SemanticException {
    String alias_id = getAliasId(alias, qb);
    Table tab = qb.getMetaData().getSrcForAlias(alias);
    RowResolver rwsch;
    // is the table already present
    TableScanOperator top = topOps.get(alias_id);
    // Obtain table props in query
    Map<String, String> properties = qb.getTabPropsForAlias(alias);
    if (top == null) {
        // Determine row schema for TSOP.
        // Include column names from the SerDe, the partition columns, and the virtual columns.
        rwsch = new RowResolver();
        try {
            // Including parameters passed in the query
            if (properties != null) {
                for (Entry<String, String> prop : properties.entrySet()) {
                    if (tab.getSerdeParam(prop.getKey()) != null) {
                        LOG.warn("SerDe property in input query overrides stored SerDe property");
                    }
                    tab.setSerdeParam(prop.getKey(), prop.getValue());
                }
            }
            // Obtain inspector for schema
            StructObjectInspector rowObjectInspector = (StructObjectInspector) tab.getDeserializer().getObjectInspector();
            List<? extends StructField> fields = rowObjectInspector.getAllStructFieldRefs();
            for (int i = 0; i < fields.size(); i++) {
                /**
                 * if the column is a skewed column, use ColumnInfo accordingly
                 */
                ColumnInfo colInfo = new ColumnInfo(fields.get(i).getFieldName(), TypeInfoUtils.getTypeInfoFromObjectInspector(fields.get(i).getFieldObjectInspector()), alias, false);
                colInfo.setSkewedCol(isSkewedCol(alias, qb, fields.get(i).getFieldName()));
                rwsch.put(alias, fields.get(i).getFieldName(), colInfo);
            }
        } catch (SerDeException e) {
            throw new RuntimeException(e);
        }
        // Finally add the partitioning columns
        for (FieldSchema part_col : tab.getPartCols()) {
            LOG.trace("Adding partition col: " + part_col);
            rwsch.put(alias, part_col.getName(), new ColumnInfo(part_col.getName(), TypeInfoFactory.getPrimitiveTypeInfo(part_col.getType()), alias, true));
        }
        // put all virtual columns in RowResolver.
        Iterator<VirtualColumn> vcs = VirtualColumn.getRegistry(conf).iterator();
        // use a list for easy customization
        List<VirtualColumn> vcList = new ArrayList<VirtualColumn>();
        while (vcs.hasNext()) {
            VirtualColumn vc = vcs.next();
            rwsch.put(alias, vc.getName().toLowerCase(), new ColumnInfo(vc.getName(), vc.getTypeInfo(), alias, true, vc.getIsHidden()));
            vcList.add(vc);
        }
        // Create the root of the operator tree
        TableScanDesc tsDesc = new TableScanDesc(alias, vcList, tab);
        setupStats(tsDesc, qb.getParseInfo(), tab, alias, rwsch);
        SplitSample sample = nameToSplitSample.get(alias_id);
        if (sample != null && sample.getRowCount() != null) {
            tsDesc.setRowLimit(sample.getRowCount());
            nameToSplitSample.remove(alias_id);
        }
        top = (TableScanOperator) putOpInsertMap(OperatorFactory.get(getOpContext(), tsDesc, new RowSchema(rwsch.getColumnInfos())), rwsch);
        // Set insideView so that we can skip the column authorization for this.
        top.setInsideView(qb.isInsideView() || qb.getAliasInsideView().contains(alias.toLowerCase()));
        // Add this to the list of top operators - we always start from a table
        // scan
        topOps.put(alias_id, top);
        // Add a mapping from the table scan operator to Table
        topToTable.put(top, tab);
        if (properties != null) {
            topToTableProps.put(top, properties);
            tsDesc.setOpProps(properties);
        }
    } else {
        rwsch = opParseCtx.get(top).getRowResolver();
        top.setChildOperators(null);
    }
    // check if this table is sampled and needs more than input pruning
    Operator<? extends OperatorDesc> op = top;
    TableSample ts = qb.getParseInfo().getTabSample(alias);
    if (ts != null) {
        TableScanOperator tableScanOp = top;
        tableScanOp.getConf().setTableSample(ts);
        int num = ts.getNumerator();
        int den = ts.getDenominator();
        ArrayList<ASTNode> sampleExprs = ts.getExprs();
        // TODO: Do the type checking of the expressions
        List<String> tabBucketCols = tab.getBucketCols();
        int numBuckets = tab.getNumBuckets();
        // If there are no sample cols and no bucket cols then throw an error
        if (tabBucketCols.size() == 0 && sampleExprs.size() == 0) {
            throw new SemanticException(ErrorMsg.NON_BUCKETED_TABLE.getMsg() + " " + tab.getTableName());
        }
        if (num > den) {
            throw new SemanticException(ErrorMsg.BUCKETED_NUMERATOR_BIGGER_DENOMINATOR.getMsg() + " " + tab.getTableName());
        }
        // check if a predicate is needed
        // predicate is needed if either input pruning is not enough
        // or if input pruning is not possible
        // check if the sample columns are the same as the table bucket columns
        boolean colsEqual = true;
        if ((sampleExprs.size() != tabBucketCols.size()) && (sampleExprs.size() != 0)) {
            colsEqual = false;
        }
        for (int i = 0; i < sampleExprs.size() && colsEqual; i++) {
            boolean colFound = false;
            for (int j = 0; j < tabBucketCols.size() && !colFound; j++) {
                if (sampleExprs.get(i).getToken().getType() != HiveParser.TOK_TABLE_OR_COL) {
                    break;
                }
                if (((ASTNode) sampleExprs.get(i).getChild(0)).getText().equalsIgnoreCase(tabBucketCols.get(j))) {
                    colFound = true;
                }
            }
            colsEqual = (colsEqual && colFound);
        }
        // Check if input can be pruned
        ts.setInputPruning((sampleExprs == null || sampleExprs.size() == 0 || colsEqual));
        // check if input pruning is enough
        if ((sampleExprs == null || sampleExprs.size() == 0 || colsEqual) && (num == den || (den % numBuckets == 0 || numBuckets % den == 0))) {
            // input pruning is enough; add the filter for the optimizer to use it
            // later
            LOG.info("No need for sample filter");
            ExprNodeDesc samplePredicate = genSamplePredicate(ts, tabBucketCols, colsEqual, alias, rwsch, qb.getMetaData(), null);
            FilterDesc filterDesc = new FilterDesc(samplePredicate, true, new SampleDesc(ts.getNumerator(), ts.getDenominator(), tabBucketCols, true));
            filterDesc.setGenerated(true);
            op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
        } else {
            // need to add filter
            // create tableOp to be filterDesc and set as child to 'top'
            LOG.info("Need sample filter");
            ExprNodeDesc samplePredicate = genSamplePredicate(ts, tabBucketCols, colsEqual, alias, rwsch, qb.getMetaData(), null);
            FilterDesc filterDesc = new FilterDesc(samplePredicate, true);
            filterDesc.setGenerated(true);
            op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
        }
    } else {
        boolean testMode = conf.getBoolVar(HiveConf.ConfVars.HIVETESTMODE);
        if (testMode) {
            String tabName = tab.getTableName();
            // has the user explicitly asked not to sample this table
            String unSampleTblList = conf.getVar(HiveConf.ConfVars.HIVETESTMODENOSAMPLE);
            String[] unSampleTbls = unSampleTblList.split(",");
            boolean unsample = false;
            for (String unSampleTbl : unSampleTbls) {
                if (tabName.equalsIgnoreCase(unSampleTbl)) {
                    unsample = true;
                }
            }
            if (!unsample) {
                int numBuckets = tab.getNumBuckets();
                // If the input table is bucketed, choose the first bucket
                if (numBuckets > 0) {
                    TableSample tsSample = new TableSample(1, numBuckets);
                    tsSample.setInputPruning(true);
                    qb.getParseInfo().setTabSample(alias, tsSample);
                    ExprNodeDesc samplePred = genSamplePredicate(tsSample, tab.getBucketCols(), true, alias, rwsch, qb.getMetaData(), null);
                    FilterDesc filterDesc = new FilterDesc(samplePred, true, new SampleDesc(tsSample.getNumerator(), tsSample.getDenominator(), tab.getBucketCols(), true));
                    filterDesc.setGenerated(true);
                    op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
                    LOG.info("No need for sample filter");
                } else {
                    // The table is not bucketed, add a dummy filter :: rand()
                    int freq = conf.getIntVar(HiveConf.ConfVars.HIVETESTMODESAMPLEFREQ);
                    TableSample tsSample = new TableSample(1, freq);
                    tsSample.setInputPruning(false);
                    qb.getParseInfo().setTabSample(alias, tsSample);
                    LOG.info("Need sample filter");
                    ExprNodeDesc randFunc = TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("rand", new ExprNodeConstantDesc(Integer.valueOf(460476415)));
                    ExprNodeDesc samplePred = genSamplePredicate(tsSample, null, false, alias, rwsch, qb.getMetaData(), randFunc);
                    FilterDesc filterDesc = new FilterDesc(samplePred, true);
                    filterDesc.setGenerated(true);
                    op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
                }
            }
        }
    }
    Operator output = putOpInsertMap(op, rwsch);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Created Table Plan for " + alias + " " + op.toString());
    }
    return output;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) Table(org.apache.hadoop.hive.ql.metadata.Table) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) SampleDesc(org.apache.hadoop.hive.ql.plan.FilterDesc.SampleDesc) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) CheckConstraint(org.apache.hadoop.hive.ql.metadata.CheckConstraint) NotNullConstraint(org.apache.hadoop.hive.ql.metadata.NotNullConstraint) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) DefaultConstraint(org.apache.hadoop.hive.ql.metadata.DefaultConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) FilterDesc(org.apache.hadoop.hive.ql.plan.FilterDesc) VirtualColumn(org.apache.hadoop.hive.ql.metadata.VirtualColumn) StandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
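
The TABLESAMPLE handling above reduces to one decision: pruning input buckets is enough when the sample expressions are absent or match the table's bucket columns, and the sample denominator and the bucket count divide evenly into one another (or the sample covers the whole table); otherwise a sample filter predicate must be added on top of the table scan. A simplified, self-contained sketch of that decision follows; the class and method names are hypothetical, and the column comparison is reduced to an exact list match rather than the per-expression, case-insensitive check used in genTablePlan.

import java.util.List;

public class SamplePruningSketch {
    // Mirrors the condition used for TABLESAMPLE(BUCKET num OUT OF den ...).
    static boolean inputPruningIsEnough(int num, int den, int numBuckets,
                                        List<String> sampleCols, List<String> bucketCols) {
        // Sample columns are "equal" when none are given or they match the bucket columns.
        boolean colsEqual = sampleCols.isEmpty() || sampleCols.equals(bucketCols);
        // Pruning whole buckets only works when each sample maps to a whole number of buckets.
        boolean bucketsDivide = num == den || den % numBuckets == 0 || numBuckets % den == 0;
        return colsEqual && bucketsDivide;
    }

    public static void main(String[] args) {
        List<String> bucketCols = List.of("userid");
        // 1 out of 2 buckets on the bucketing column of a 32-bucket table: prune only.
        System.out.println(inputPruningIsEnough(1, 2, 32, List.of("userid"), bucketCols)); // true
        // 1 out of 3 does not divide 32 buckets evenly: a sample filter is needed.
        System.out.println(inputPruningIsEnough(1, 3, 32, List.of("userid"), bucketCols)); // false
    }
}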

Example 74 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SemanticAnalyzer method genPlan.

@SuppressWarnings("nls")
public Operator genPlan(QB qb, boolean skipAmbiguityCheck) throws SemanticException {
    // First generate all the opInfos for the elements in the from clause
    // Must be deterministic order map - see HIVE-8707
    Map<String, Operator> aliasToOpInfo = new LinkedHashMap<String, Operator>();
    // Recurse over the subqueries to fill the subquery part of the plan
    for (String alias : qb.getSubqAliases()) {
        QBExpr qbexpr = qb.getSubqForAlias(alias);
        Operator<?> operator = genPlan(qb, qbexpr);
        aliasToOpInfo.put(alias, operator);
        if (qb.getViewToTabSchema().containsKey(alias)) {
            // we set viewProjectToTableSchema so that we can leverage ColumnPruner.
            if (operator instanceof LimitOperator) {
                // This can happen when the CREATE VIEW statement has a LIMIT clause;
                // fetch the parent operator.
                operator = operator.getParentOperators().get(0);
            }
            if (operator instanceof SelectOperator) {
                if (this.viewProjectToTableSchema == null) {
                    this.viewProjectToTableSchema = new LinkedHashMap<>();
                }
                viewProjectToTableSchema.put((SelectOperator) operator, qb.getViewToTabSchema().get(alias));
            } else {
                throw new SemanticException("View " + alias + " corresponds to " + operator.getType().name() + ", rather than a SelectOperator.");
            }
        }
    }
    // Recurse over all the source tables
    for (String alias : qb.getTabAliases()) {
        if (alias.equals(DUMMY_TABLE)) {
            continue;
        }
        Operator op = genTablePlan(alias, qb);
        aliasToOpInfo.put(alias, op);
    }
    if (aliasToOpInfo.isEmpty()) {
        qb.getMetaData().setSrcForAlias(DUMMY_TABLE, getDummyTable());
        TableScanOperator op = (TableScanOperator) genTablePlan(DUMMY_TABLE, qb);
        op.getConf().setRowLimit(1);
        qb.addAlias(DUMMY_TABLE);
        qb.setTabAlias(DUMMY_TABLE, DUMMY_TABLE);
        aliasToOpInfo.put(DUMMY_TABLE, op);
    }
    Operator srcOpInfo = null;
    Operator lastPTFOp = null;
    if (queryProperties.hasPTF()) {
        // After processing subqueries and source tables, process
        // partitioned table functions
        HashMap<ASTNode, PTFInvocationSpec> ptfNodeToSpec = qb.getPTFNodeToSpec();
        if (ptfNodeToSpec != null) {
            for (Entry<ASTNode, PTFInvocationSpec> entry : ptfNodeToSpec.entrySet()) {
                ASTNode ast = entry.getKey();
                PTFInvocationSpec spec = entry.getValue();
                String inputAlias = spec.getQueryInputName();
                Operator inOp = aliasToOpInfo.get(inputAlias);
                if (inOp == null) {
                    throw new SemanticException(generateErrorMessage(ast, "Cannot resolve input Operator for PTF invocation"));
                }
                lastPTFOp = genPTFPlan(spec, inOp);
                String ptfAlias = spec.getFunction().getAlias();
                if (ptfAlias != null) {
                    aliasToOpInfo.put(ptfAlias, lastPTFOp);
                }
            }
        }
    }
    // For all the source tables that have a lateral view, attach the
    // appropriate operators to the TS
    genLateralViewPlans(aliasToOpInfo, qb);
    // process join
    if (qb.getParseInfo().getJoinExpr() != null) {
        ASTNode joinExpr = qb.getParseInfo().getJoinExpr();
        if (joinExpr.getToken().getType() == HiveParser.TOK_UNIQUEJOIN) {
            QBJoinTree joinTree = genUniqueJoinTree(qb, joinExpr, aliasToOpInfo);
            qb.setQbJoinTree(joinTree);
        } else {
            QBJoinTree joinTree = genJoinTree(qb, joinExpr, aliasToOpInfo);
            qb.setQbJoinTree(joinTree);
            /*
         * if there is only one destination in Query try to push where predicates
         * as Join conditions
         */
            Set<String> dests = qb.getParseInfo().getClauseNames();
            if (dests.size() == 1 && joinTree.getNoOuterJoin()) {
                String dest = dests.iterator().next();
                ASTNode whereClause = qb.getParseInfo().getWhrForClause(dest);
                if (whereClause != null) {
                    extractJoinCondsFromWhereClause(joinTree, qb, dest, (ASTNode) whereClause.getChild(0), aliasToOpInfo);
                }
            }
            if (!disableJoinMerge) {
                mergeJoinTree(qb);
            }
        }
        // if any filters are present in the join tree, push them on top of the
        // table
        pushJoinFilters(qb, qb.getQbJoinTree(), aliasToOpInfo);
        srcOpInfo = genJoinPlan(qb, aliasToOpInfo);
    } else {
        // If there is more than one source then we have a join case;
        // later we can extend this to the union all case as well
        srcOpInfo = aliasToOpInfo.values().iterator().next();
        // with PTFs, there may be more (note for PTF chains:
        // 1 PTF invocation may entail multiple PTF operators)
        srcOpInfo = lastPTFOp != null ? lastPTFOp : srcOpInfo;
    }
    Operator bodyOpInfo = genBodyPlan(qb, srcOpInfo, aliasToOpInfo);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Created Plan for Query Block " + qb.getId());
    }
    if (qb.getAlias() != null) {
        rewriteRRForSubQ(qb.getAlias(), bodyOpInfo, skipAmbiguityCheck);
    }
    setQB(qb);
    return bodyOpInfo;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) LinkedHashMap(java.util.LinkedHashMap) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException)
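
The comment on aliasToOpInfo points at HIVE-8707: plan generation has to iterate the from-clause aliases in a deterministic order, which is why a LinkedHashMap is used instead of a plain HashMap. A small, self-contained illustration of the difference in iteration guarantees (the alias and operator names are just placeholders):

import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

public class DeterministicMapSketch {
    public static void main(String[] args) {
        // LinkedHashMap iterates in insertion order, so later passes over the
        // aliases see them in the same order on every run.
        Map<String, String> ordered = new LinkedHashMap<>();
        ordered.put("subq1", "SEL");
        ordered.put("t1", "TS");
        ordered.put("t2", "TS");
        System.out.println(ordered.keySet()); // always [subq1, t1, t2]

        // HashMap makes no ordering guarantee; iteration order can differ
        // across JVMs and versions, making the generated plan non-deterministic.
        Map<String, String> unordered = new HashMap<>(ordered);
        System.out.println(unordered.keySet()); // order not guaranteed
    }
}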

Example 75 with TableScanOperator

use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

the class SetSparkReducerParallelism method process.

@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    OptimizeSparkProcContext context = (OptimizeSparkProcContext) procContext;
    ReduceSinkOperator sink = (ReduceSinkOperator) nd;
    ReduceSinkDesc desc = sink.getConf();
    Set<ReduceSinkOperator> parentSinks = null;
    int maxReducers = context.getConf().getIntVar(HiveConf.ConfVars.MAXREDUCERS);
    int constantReducers = context.getConf().getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
    if (!useOpStats) {
        parentSinks = OperatorUtils.findOperatorsUpstream(sink, ReduceSinkOperator.class);
        parentSinks.remove(sink);
        if (!context.getVisitedReduceSinks().containsAll(parentSinks)) {
            // We haven't processed all the parent sinks, and we need
            // them to be done in order to compute the parallelism for this sink.
            // In this case, skip. We should visit this again from another path.
            LOG.debug("Skipping sink " + sink + " for now as we haven't seen all its parents.");
            return false;
        }
    }
    if (context.getVisitedReduceSinks().contains(sink)) {
        // skip walking the children
        LOG.debug("Already processed reduce sink: " + sink.getName());
        return true;
    }
    context.getVisitedReduceSinks().add(sink);
    if (needSetParallelism(sink, context.getConf())) {
        if (constantReducers > 0) {
            LOG.info("Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
            desc.setNumReducers(constantReducers);
        } else {
            // If it's a FileSink to bucketed files, use the bucket count as the reducer number
            FileSinkOperator fso = GenSparkUtils.getChildOperator(sink, FileSinkOperator.class);
            if (fso != null) {
                String bucketCount = fso.getConf().getTableInfo().getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT);
                int numBuckets = bucketCount == null ? 0 : Integer.parseInt(bucketCount);
                if (numBuckets > 0) {
                    LOG.info("Set parallelism for reduce sink " + sink + " to: " + numBuckets + " (buckets)");
                    desc.setNumReducers(numBuckets);
                    return false;
                }
            }
            if (useOpStats || parentSinks.isEmpty()) {
                long numberOfBytes = 0;
                if (useOpStats) {
                    // we need to add up all the estimates from the siblings of this reduce sink
                    for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
                        if (sibling.getStatistics() != null) {
                            numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize());
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Sibling " + sibling + " has stats: " + sibling.getStatistics());
                            }
                        } else {
                            LOG.warn("No stats available from: " + sibling);
                        }
                    }
                } else {
                    // we should use TS stats to infer parallelism
                    for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
                        Set<TableScanOperator> sources = OperatorUtils.findOperatorsUpstream(sibling, TableScanOperator.class);
                        for (TableScanOperator source : sources) {
                            if (source.getStatistics() != null) {
                                numberOfBytes = StatsUtils.safeAdd(numberOfBytes, source.getStatistics().getDataSize());
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Table source " + source + " has stats: " + source.getStatistics());
                                }
                            } else {
                                LOG.warn("No stats available from table source: " + source);
                            }
                        }
                    }
                    LOG.debug("Gathered stats for sink " + sink + ". Total size is " + numberOfBytes + " bytes.");
                }
                // Divide it by 2 so that we can have more reducers
                long bytesPerReducer = context.getConf().getLongVar(HiveConf.ConfVars.BYTESPERREDUCER) / 2;
                int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
                getSparkMemoryAndCores(context);
                if (sparkMemoryAndCores != null && sparkMemoryAndCores.getFirst() > 0 && sparkMemoryAndCores.getSecond() > 0) {
                    // warn the user if bytes per reducer is much larger than memory per task
                    if ((double) sparkMemoryAndCores.getFirst() / bytesPerReducer < 0.5) {
                        LOG.warn("Average load of a reducer is much larger than its available memory. " + "Consider decreasing hive.exec.reducers.bytes.per.reducer");
                    }
                    // If there are more cores, use the number of cores
                    numReducers = Math.max(numReducers, sparkMemoryAndCores.getSecond());
                }
                numReducers = Math.min(numReducers, maxReducers);
                LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers + " (calculated)");
                desc.setNumReducers(numReducers);
            } else {
                // Use the maximum parallelism from all parent reduce sinks
                int numberOfReducers = 0;
                for (ReduceSinkOperator parent : parentSinks) {
                    numberOfReducers = Math.max(numberOfReducers, parent.getConf().getNumReducers());
                }
                desc.setNumReducers(numberOfReducers);
                LOG.debug("Set parallelism for sink " + sink + " to " + numberOfReducers + " based on its parents");
            }
            final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> keyCols = ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getKeyCols());
            final Collection<ExprNodeDesc.ExprNodeDescEqualityWrapper> partCols = ExprNodeDesc.ExprNodeDescEqualityWrapper.transform(desc.getPartitionCols());
            if (keyCols != null && keyCols.equals(partCols)) {
                desc.setReducerTraits(EnumSet.of(UNIFORM));
            }
        }
    } else {
        LOG.info("Number of reducers for sink " + sink + " was already determined to be: " + desc.getNumReducers());
    }
    return false;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) OptimizeSparkProcContext(org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
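
When operator or table-scan statistics are available, the parallelism above works out to roughly: halve hive.exec.reducers.bytes.per.reducer, divide the estimated input size by that, raise the result to the number of available executor cores if that is larger, and clamp it to hive.exec.reducers.max. The sketch below only illustrates that arithmetic; it is not the actual Utilities.estimateReducers implementation, and the class and method names are hypothetical.

public class ReducerEstimateSketch {
    // Simplified estimate: ceil(totalBytes / (bytesPerReducer / 2)), at least 1,
    // raised to the executor core count if larger, clamped to maxReducers.
    static int estimateReducers(long totalBytes, long bytesPerReducer,
                                int maxReducers, int availableCores) {
        long halved = bytesPerReducer / 2;                         // "Divide it by 2 so that we can have more reducers"
        int reducers = (int) ((totalBytes + halved - 1) / halved); // ceiling division
        reducers = Math.max(1, reducers);
        reducers = Math.max(reducers, availableCores);             // "If there are more cores, use the number of cores"
        return Math.min(reducers, maxReducers);                    // never exceed the configured maximum
    }

    public static void main(String[] args) {
        // 10 GB of input, 256 MB per reducer (halved to 128 MB), max 1009 reducers, 48 cores.
        long tenGb = 10L * 1024 * 1024 * 1024;
        long perReducer = 256L * 1024 * 1024;
        System.out.println(estimateReducers(tenGb, perReducer, 1009, 48)); // 80
    }
}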

Aggregations

TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)88
Operator (org.apache.hadoop.hive.ql.exec.Operator)35
ArrayList (java.util.ArrayList)33
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)28
Table (org.apache.hadoop.hive.ql.metadata.Table)21
HashMap (java.util.HashMap)20
Path (org.apache.hadoop.fs.Path)20
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)20
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)19
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)19
LinkedHashMap (java.util.LinkedHashMap)18
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)18
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)18
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)15
MapWork (org.apache.hadoop.hive.ql.plan.MapWork)15
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)15
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)14
Map (java.util.Map)13
AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator)12
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)12