
Example 61 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

The class NullScanTaskDispatcher, method processAlias.

private void processAlias(MapWork work, HashSet<TableScanOperator> tableScans) {
    ArrayList<String> aliases = new ArrayList<String>();
    for (TableScanOperator tso : tableScans) {
        // should not apply this for non-native table
        if (tso.getConf().getTableMetadata().getStorageHandler() != null) {
            continue;
        }
        String alias = getAliasForTableScanOperator(work, tso);
        aliases.add(alias);
        tso.getConf().setIsMetadataOnly(true);
    }
    // group path alias according to work
    LinkedHashMap<Path, ArrayList<String>> candidates = new LinkedHashMap<>();
    for (Path path : work.getPaths()) {
        ArrayList<String> aliasesAffected = work.getPathToAliases().get(path);
        if (aliasesAffected != null && aliasesAffected.size() > 0) {
            candidates.put(path, aliasesAffected);
        }
    }
    for (Entry<Path, ArrayList<String>> entry : candidates.entrySet()) {
        processAlias(work, entry.getKey(), entry.getValue(), aliases);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap)
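
The core of processAlias is collecting, per input path, the aliases that are still affected, so that each candidate path can later be rewritten as a metadata-only (null-scan) input. Below is a minimal, self-contained sketch of just that grouping step using plain Java collections in place of Hive's MapWork and org.apache.hadoop.fs.Path; the path strings and alias names are made up for illustration.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class PathAliasGrouping {

    // Stand-in for MapWork.getPathToAliases(): maps an input path to the aliases that read it.
    static Map<String, List<String>> pathToAliases = new LinkedHashMap<>();

    public static void main(String[] args) {
        pathToAliases.put("/warehouse/t1/part=1", new ArrayList<>(Arrays.asList("a")));
        pathToAliases.put("/warehouse/t2/part=1", new ArrayList<>());
        pathToAliases.put("/warehouse/t3/part=1", new ArrayList<>(Arrays.asList("b", "c")));

        // Keep only paths that actually have affected aliases, preserving insertion order,
        // mirroring the candidate-collection loop in processAlias.
        Map<String, List<String>> candidates = new LinkedHashMap<>();
        for (Map.Entry<String, List<String>> e : pathToAliases.entrySet()) {
            if (e.getValue() != null && !e.getValue().isEmpty()) {
                candidates.put(e.getKey(), e.getValue());
            }
        }
        System.out.println(candidates); // {/warehouse/t1/part=1=[a], /warehouse/t3/part=1=[b, c]}
    }
}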

Example 62 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

The class SamplingOptimizer, method resolve.

public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {
    for (Task<?> task : pctx.getRootTasks()) {
        if (!(task instanceof MapRedTask) || !((MapRedTask) task).getWork().isFinalMapRed()) {
            // this could be replaced by bucketing on RS + bucketed fetcher for next MR
            continue;
        }
        MapredWork mrWork = ((MapRedTask) task).getWork();
        MapWork mapWork = mrWork.getMapWork();
        ReduceWork reduceWork = mrWork.getReduceWork();
        if (reduceWork == null || reduceWork.getNumReduceTasks() != 1 || mapWork.getAliasToWork().size() != 1 || mapWork.getSamplingType() > 0 || reduceWork.getReducer() == null) {
            continue;
        }
        // GROUPBY operator in reducer may not be processed in parallel. Skip optimizing.
        if (OperatorUtils.findSingleOperator(reduceWork.getReducer(), GroupByOperator.class) != null) {
            continue;
        }
        Operator<?> operator = mapWork.getAliasToWork().values().iterator().next();
        if (!(operator instanceof TableScanOperator)) {
            continue;
        }
        ReduceSinkOperator child = OperatorUtils.findSingleOperator(operator, ReduceSinkOperator.class);
        if (child == null || child.getConf().getNumReducers() != 1 || !child.getConf().getPartitionCols().isEmpty()) {
            continue;
        }
        child.getConf().setNumReducers(-1);
        reduceWork.setNumReduceTasks(-1);
        mapWork.setSamplingType(MapWork.SAMPLING_ON_START);
    }
    return pctx;
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork)
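
The optimizer only proceeds when it can locate exactly one ReduceSinkOperator beneath the single table scan, using OperatorUtils.findSingleOperator. The following is a rough, self-contained sketch of that kind of lookup over a simplified operator tree; the Op class, its fields, and the operator names are hypothetical stand-ins for illustration, not Hive's actual Operator API.

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.List;

public class FindSingleOperatorSketch {

    // Hypothetical, simplified stand-in for an operator node: a name and child operators.
    static class Op {
        final String name;
        final List<Op> children;
        Op(String name, Op... children) {
            this.name = name;
            this.children = Arrays.asList(children);
        }
    }

    // Breadth-first search for the single operator whose name matches 'target'.
    // Returns null when there is no unique match; callers can then bail out,
    // much like the 'continue' guards in SamplingOptimizer.resolve.
    static Op findSingle(Op root, String target) {
        Op found = null;
        Deque<Op> queue = new ArrayDeque<>();
        queue.add(root);
        while (!queue.isEmpty()) {
            Op op = queue.poll();
            if (op.name.equals(target)) {
                if (found != null) {
                    return null; // more than one match: treat as not found (this sketch's choice)
                }
                found = op;
            }
            queue.addAll(op.children);
        }
        return found;
    }

    public static void main(String[] args) {
        Op plan = new Op("TS", new Op("FIL", new Op("RS")));
        Op rs = findSingle(plan, "RS");
        System.out.println(rs != null ? "found " + rs.name : "not found");
    }
}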

Example 63 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

The class SemanticAnalyzer, method genPlan.

@SuppressWarnings("nls")
public Operator genPlan(QB qb, boolean skipAmbiguityCheck) throws SemanticException {
    // First generate all the opInfos for the elements in the from clause
    // Must be deterministic order map - see HIVE-8707
    Map<String, Operator> aliasToOpInfo = new LinkedHashMap<String, Operator>();
    // Recurse over the subqueries to fill the subquery part of the plan
    for (String alias : qb.getSubqAliases()) {
        QBExpr qbexpr = qb.getSubqForAlias(alias);
        Operator operator = genPlan(qb, qbexpr);
        aliasToOpInfo.put(alias, operator);
        if (qb.getViewToTabSchema().containsKey(alias)) {
            // we set viewProjectToTableSchema so that we can leverage ColumnPruner.
            if (operator instanceof SelectOperator) {
                if (this.viewProjectToTableSchema == null) {
                    this.viewProjectToTableSchema = new LinkedHashMap<>();
                }
                viewProjectToTableSchema.put((SelectOperator) operator, qb.getViewToTabSchema().get(alias));
            } else {
                throw new SemanticException("View " + alias + " is corresponding to " + operator.getType().name() + ", rather than a SelectOperator.");
            }
        }
    }
    // Recurse over all the source tables
    for (String alias : qb.getTabAliases()) {
        Operator op = genTablePlan(alias, qb);
        aliasToOpInfo.put(alias, op);
    }
    if (aliasToOpInfo.isEmpty()) {
        qb.getMetaData().setSrcForAlias(DUMMY_TABLE, getDummyTable());
        TableScanOperator op = (TableScanOperator) genTablePlan(DUMMY_TABLE, qb);
        op.getConf().setRowLimit(1);
        qb.addAlias(DUMMY_TABLE);
        qb.setTabAlias(DUMMY_TABLE, DUMMY_TABLE);
        aliasToOpInfo.put(DUMMY_TABLE, op);
    }
    Operator srcOpInfo = null;
    Operator lastPTFOp = null;
    if (queryProperties.hasPTF()) {
        // After processing subqueries and source tables, process
        // partitioned table functions
        HashMap<ASTNode, PTFInvocationSpec> ptfNodeToSpec = qb.getPTFNodeToSpec();
        if (ptfNodeToSpec != null) {
            for (Entry<ASTNode, PTFInvocationSpec> entry : ptfNodeToSpec.entrySet()) {
                ASTNode ast = entry.getKey();
                PTFInvocationSpec spec = entry.getValue();
                String inputAlias = spec.getQueryInputName();
                Operator inOp = aliasToOpInfo.get(inputAlias);
                if (inOp == null) {
                    throw new SemanticException(generateErrorMessage(ast, "Cannot resolve input Operator for PTF invocation"));
                }
                lastPTFOp = genPTFPlan(spec, inOp);
                String ptfAlias = spec.getFunction().getAlias();
                if (ptfAlias != null) {
                    aliasToOpInfo.put(ptfAlias, lastPTFOp);
                }
            }
        }
    }
    // For all the source tables that have a lateral view, attach the
    // appropriate operators to the TS
    genLateralViewPlans(aliasToOpInfo, qb);
    // process join
    if (qb.getParseInfo().getJoinExpr() != null) {
        ASTNode joinExpr = qb.getParseInfo().getJoinExpr();
        if (joinExpr.getToken().getType() == HiveParser.TOK_UNIQUEJOIN) {
            QBJoinTree joinTree = genUniqueJoinTree(qb, joinExpr, aliasToOpInfo);
            qb.setQbJoinTree(joinTree);
        } else {
            QBJoinTree joinTree = genJoinTree(qb, joinExpr, aliasToOpInfo);
            qb.setQbJoinTree(joinTree);
            /*
             * if there is only one destination in Query try to push where predicates
             * as Join conditions
             */
            Set<String> dests = qb.getParseInfo().getClauseNames();
            if (dests.size() == 1 && joinTree.getNoOuterJoin()) {
                String dest = dests.iterator().next();
                ASTNode whereClause = qb.getParseInfo().getWhrForClause(dest);
                if (whereClause != null) {
                    extractJoinCondsFromWhereClause(joinTree, qb, dest, (ASTNode) whereClause.getChild(0), aliasToOpInfo);
                }
            }
            if (!disableJoinMerge) {
                mergeJoinTree(qb);
            }
        }
        // if any filters are present in the join tree, push them on top of the
        // table
        pushJoinFilters(qb, qb.getQbJoinTree(), aliasToOpInfo);
        srcOpInfo = genJoinPlan(qb, aliasToOpInfo);
    } else {
        // Now if there is more than one source then we have a join case;
        // later we can extend this to the union all case as well
        srcOpInfo = aliasToOpInfo.values().iterator().next();
        // with PTFs, there may be more (note for PTFChains:
        // 1 PTF invocation may entail multiple PTF operators)
        srcOpInfo = lastPTFOp != null ? lastPTFOp : srcOpInfo;
    }
    Operator bodyOpInfo = genBodyPlan(qb, srcOpInfo, aliasToOpInfo);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Created Plan for Query Block " + qb.getId());
    }
    if (qb.getAlias() != null) {
        rewriteRRForSubQ(qb.getAlias(), bodyOpInfo, skipAmbiguityCheck);
    }
    setQB(qb);
    return bodyOpInfo;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) LinkedHashMap(java.util.LinkedHashMap) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException)
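
Note the comment at the top of genPlan: aliasToOpInfo must be a deterministic-order map (see HIVE-8707), which is why a LinkedHashMap is used rather than a HashMap. A tiny sketch of why the map choice matters, with made-up alias and operator names:

import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

public class DeterministicAliasOrder {
    public static void main(String[] args) {
        // LinkedHashMap iterates in insertion order; HashMap guarantees no particular order,
        // so downstream plan generation could vary from run to run if it were used here.
        Map<String, String> linked = new LinkedHashMap<>();
        Map<String, String> hashed = new HashMap<>();
        for (String alias : new String[] { "t3", "t1", "t2" }) {
            linked.put(alias, "op_" + alias);
            hashed.put(alias, "op_" + alias);
        }
        System.out.println("LinkedHashMap order: " + linked.keySet()); // [t3, t1, t2]
        System.out.println("HashMap order:       " + hashed.keySet()); // implementation-dependent
    }
}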

Example 64 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

The class SemanticAnalyzer, method genTablePlan.

@SuppressWarnings("nls")
private Operator genTablePlan(String alias, QB qb) throws SemanticException {
    String alias_id = getAliasId(alias, qb);
    Table tab = qb.getMetaData().getSrcForAlias(alias);
    RowResolver rwsch;
    // is the table already present
    TableScanOperator top = topOps.get(alias_id);
    // Obtain table props in query
    Map<String, String> properties = qb.getTabPropsForAlias(alias);
    if (top == null) {
        // Determine row schema for TSOP.
        // Include column names from SerDe, the partition and virtual columns.
        rwsch = new RowResolver();
        try {
            // Including parameters passed in the query
            if (properties != null) {
                for (Entry<String, String> prop : properties.entrySet()) {
                    if (tab.getSerdeParam(prop.getKey()) != null) {
                        LOG.warn("SerDe property in input query overrides stored SerDe property");
                    }
                    tab.setSerdeParam(prop.getKey(), prop.getValue());
                }
            }
            // Obtain inspector for schema
            StructObjectInspector rowObjectInspector = (StructObjectInspector) tab.getDeserializer().getObjectInspector();
            List<? extends StructField> fields = rowObjectInspector.getAllStructFieldRefs();
            for (int i = 0; i < fields.size(); i++) {
                // if the column is a skewed column, use ColumnInfo accordingly
                ColumnInfo colInfo = new ColumnInfo(fields.get(i).getFieldName(), TypeInfoUtils.getTypeInfoFromObjectInspector(fields.get(i).getFieldObjectInspector()), alias, false);
                colInfo.setSkewedCol(isSkewedCol(alias, qb, fields.get(i).getFieldName()));
                rwsch.put(alias, fields.get(i).getFieldName(), colInfo);
            }
        } catch (SerDeException e) {
            throw new RuntimeException(e);
        }
        // Finally add the partitioning columns
        for (FieldSchema part_col : tab.getPartCols()) {
            LOG.trace("Adding partition col: " + part_col);
            rwsch.put(alias, part_col.getName(), new ColumnInfo(part_col.getName(), TypeInfoFactory.getPrimitiveTypeInfo(part_col.getType()), alias, true));
        }
        // put all virtual columns in RowResolver.
        Iterator<VirtualColumn> vcs = VirtualColumn.getRegistry(conf).iterator();
        // use a list for easy customization
        List<VirtualColumn> vcList = new ArrayList<VirtualColumn>();
        while (vcs.hasNext()) {
            VirtualColumn vc = vcs.next();
            rwsch.put(alias, vc.getName().toLowerCase(), new ColumnInfo(vc.getName(), vc.getTypeInfo(), alias, true, vc.getIsHidden()));
            vcList.add(vc);
        }
        // Create the root of the operator tree
        TableScanDesc tsDesc = new TableScanDesc(alias, vcList, tab);
        setupStats(tsDesc, qb.getParseInfo(), tab, alias, rwsch);
        SplitSample sample = nameToSplitSample.get(alias_id);
        if (sample != null && sample.getRowCount() != null) {
            tsDesc.setRowLimit(sample.getRowCount());
            nameToSplitSample.remove(alias_id);
        }
        top = (TableScanOperator) putOpInsertMap(OperatorFactory.get(getOpContext(), tsDesc, new RowSchema(rwsch.getColumnInfos())), rwsch);
        // Set insideView so that we can skip the column authorization for this.
        top.setInsideView(qb.isInsideView() || qb.getAliasInsideView().contains(alias.toLowerCase()));
        // Add this to the list of top operators - we always start from a table
        // scan
        topOps.put(alias_id, top);
        // Add a mapping from the table scan operator to Table
        topToTable.put(top, tab);
        if (properties != null) {
            topToTableProps.put(top, properties);
            tsDesc.setOpProps(properties);
        }
    } else {
        rwsch = opParseCtx.get(top).getRowResolver();
        top.setChildOperators(null);
    }
    // check if this table is sampled and needs more than input pruning
    Operator<? extends OperatorDesc> op = top;
    TableSample ts = qb.getParseInfo().getTabSample(alias);
    if (ts != null) {
        TableScanOperator tableScanOp = top;
        tableScanOp.getConf().setTableSample(ts);
        int num = ts.getNumerator();
        int den = ts.getDenominator();
        ArrayList<ASTNode> sampleExprs = ts.getExprs();
        // TODO: Do the type checking of the expressions
        List<String> tabBucketCols = tab.getBucketCols();
        int numBuckets = tab.getNumBuckets();
        // If there are no sample cols and no bucket cols then throw an error
        if (tabBucketCols.size() == 0 && sampleExprs.size() == 0) {
            throw new SemanticException(ErrorMsg.NON_BUCKETED_TABLE.getMsg() + " " + tab.getTableName());
        }
        if (num > den) {
            throw new SemanticException(ErrorMsg.BUCKETED_NUMERATOR_BIGGER_DENOMINATOR.getMsg() + " " + tab.getTableName());
        }
        // check if a predicate is needed
        // predicate is needed if either input pruning is not enough
        // or if input pruning is not possible
        // check if the sample columns are the same as the table bucket columns
        boolean colsEqual = true;
        if ((sampleExprs.size() != tabBucketCols.size()) && (sampleExprs.size() != 0)) {
            colsEqual = false;
        }
        for (int i = 0; i < sampleExprs.size() && colsEqual; i++) {
            boolean colFound = false;
            for (int j = 0; j < tabBucketCols.size() && !colFound; j++) {
                if (sampleExprs.get(i).getToken().getType() != HiveParser.TOK_TABLE_OR_COL) {
                    break;
                }
                if (((ASTNode) sampleExprs.get(i).getChild(0)).getText().equalsIgnoreCase(tabBucketCols.get(j))) {
                    colFound = true;
                }
            }
            colsEqual = (colsEqual && colFound);
        }
        // Check if input can be pruned
        ts.setInputPruning((sampleExprs == null || sampleExprs.size() == 0 || colsEqual));
        // check if input pruning is enough
        if ((sampleExprs == null || sampleExprs.size() == 0 || colsEqual) && (num == den || (den % numBuckets == 0 || numBuckets % den == 0))) {
            // input pruning is enough; add the filter for the optimizer to use it
            // later
            LOG.info("No need for sample filter");
            ExprNodeDesc samplePredicate = genSamplePredicate(ts, tabBucketCols, colsEqual, alias, rwsch, qb.getMetaData(), null);
            FilterDesc filterDesc = new FilterDesc(samplePredicate, true, new SampleDesc(ts.getNumerator(), ts.getDenominator(), tabBucketCols, true));
            filterDesc.setGenerated(true);
            op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
        } else {
            // need to add filter
            // create tableOp to be filterDesc and set as child to 'top'
            LOG.info("Need sample filter");
            ExprNodeDesc samplePredicate = genSamplePredicate(ts, tabBucketCols, colsEqual, alias, rwsch, qb.getMetaData(), null);
            FilterDesc filterDesc = new FilterDesc(samplePredicate, true);
            filterDesc.setGenerated(true);
            op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
        }
    } else {
        boolean testMode = conf.getBoolVar(HiveConf.ConfVars.HIVETESTMODE);
        if (testMode) {
            String tabName = tab.getTableName();
            // has the user explicitly asked not to sample this table
            String unSampleTblList = conf.getVar(HiveConf.ConfVars.HIVETESTMODENOSAMPLE);
            String[] unSampleTbls = unSampleTblList.split(",");
            boolean unsample = false;
            for (String unSampleTbl : unSampleTbls) {
                if (tabName.equalsIgnoreCase(unSampleTbl)) {
                    unsample = true;
                }
            }
            if (!unsample) {
                int numBuckets = tab.getNumBuckets();
                // If the input table is bucketed, choose the first bucket
                if (numBuckets > 0) {
                    TableSample tsSample = new TableSample(1, numBuckets);
                    tsSample.setInputPruning(true);
                    qb.getParseInfo().setTabSample(alias, tsSample);
                    ExprNodeDesc samplePred = genSamplePredicate(tsSample, tab.getBucketCols(), true, alias, rwsch, qb.getMetaData(), null);
                    FilterDesc filterDesc = new FilterDesc(samplePred, true, new SampleDesc(tsSample.getNumerator(), tsSample.getDenominator(), tab.getBucketCols(), true));
                    filterDesc.setGenerated(true);
                    op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
                    LOG.info("No need for sample filter");
                } else {
                    // The table is not bucketed, add a dummy filter :: rand()
                    int freq = conf.getIntVar(HiveConf.ConfVars.HIVETESTMODESAMPLEFREQ);
                    TableSample tsSample = new TableSample(1, freq);
                    tsSample.setInputPruning(false);
                    qb.getParseInfo().setTabSample(alias, tsSample);
                    LOG.info("Need sample filter");
                    ExprNodeDesc randFunc = TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("rand", new ExprNodeConstantDesc(Integer.valueOf(460476415)));
                    ExprNodeDesc samplePred = genSamplePredicate(tsSample, null, false, alias, rwsch, qb.getMetaData(), randFunc);
                    FilterDesc filterDesc = new FilterDesc(samplePred, true);
                    filterDesc.setGenerated(true);
                    op = OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(rwsch.getColumnInfos()), top);
                }
            }
        }
    }
    Operator output = putOpInsertMap(op, rwsch);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Created Table Plan for " + alias + " " + op.toString());
    }
    return output;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) Table(org.apache.hadoop.hive.ql.metadata.Table) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) SampleDesc(org.apache.hadoop.hive.ql.plan.FilterDesc.SampleDesc) FilterDesc(org.apache.hadoop.hive.ql.plan.FilterDesc) VirtualColumn(org.apache.hadoop.hive.ql.metadata.VirtualColumn) StandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
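
The sampling branch of genTablePlan decides whether pruning the input buckets is sufficient on its own, or whether a generated sample-filter predicate must be placed on top of the table scan. The sketch below is a condensed, approximate restatement of that decision in plain Java; the method name, the simplified case-sensitive column comparison, and the example column names are this sketch's assumptions, not Hive code.

import java.util.Arrays;
import java.util.List;

public class SamplePruningCheck {

    // Approximates the check in genTablePlan: input pruning alone is enough when the sample
    // expressions are empty or match the table's bucket columns, and the sampling denominator
    // is compatible with the bucket count. Otherwise a sample filter is required.
    static boolean inputPruningIsEnough(List<String> sampleCols, List<String> bucketCols,
                                        int num, int den, int numBuckets) {
        boolean colsEqual = sampleCols.isEmpty()
                || (sampleCols.size() == bucketCols.size() && bucketCols.containsAll(sampleCols));
        return colsEqual && (num == den || den % numBuckets == 0 || numBuckets % den == 0);
    }

    public static void main(String[] args) {
        List<String> bucketCols = Arrays.asList("user_id");
        // Sampling 1 bucket out of 32 on the bucket column of a 32-bucket table: pruning suffices.
        System.out.println(inputPruningIsEnough(Arrays.asList("user_id"), bucketCols, 1, 32, 32));
        // Sampling on a non-bucket column: a filter predicate is needed.
        System.out.println(inputPruningIsEnough(Arrays.asList("event_ts"), bucketCols, 1, 32, 32));
    }
}

In the method shown above, the same outcome determines how the FilterDesc is built: the pruning-capable branch attaches a SampleDesc, while the other branch emits a plain generated sample filter.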

Aggregations

TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 64
ArrayList (java.util.ArrayList) 23
Operator (org.apache.hadoop.hive.ql.exec.Operator) 23
Path (org.apache.hadoop.fs.Path) 18
Table (org.apache.hadoop.hive.ql.metadata.Table) 17
HashMap (java.util.HashMap) 15
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) 15
LinkedHashMap (java.util.LinkedHashMap) 14
MapWork (org.apache.hadoop.hive.ql.plan.MapWork) 13
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc) 13
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator) 12
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator) 12
Map (java.util.Map) 11
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc) 11
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator) 10
Serializable (java.io.Serializable) 9
Task (org.apache.hadoop.hive.ql.exec.Task) 9
Partition (org.apache.hadoop.hive.ql.metadata.Partition) 9
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException) 9
List (java.util.List) 8