
Example 46 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From the class MapredLocalTask, method initializeOperators:

private void initializeOperators(Map<FetchOperator, JobConf> fetchOpJobConfMap) throws HiveException {
    for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : work.getAliasToWork().entrySet()) {
        LOG.debug("initializeOperators: " + entry.getKey() + ", children = " + entry.getValue().getChildOperators());
    }
    // this mapper operator is used to initialize all the operators
    for (Map.Entry<String, FetchWork> entry : work.getAliasToFetchWork().entrySet()) {
        if (entry.getValue() == null) {
            continue;
        }
        JobConf jobClone = new JobConf(job);
        TableScanOperator ts = (TableScanOperator) work.getAliasToWork().get(entry.getKey());
        // push down projections
        ColumnProjectionUtils.appendReadColumns(jobClone, ts.getNeededColumnIDs(), ts.getNeededColumns(), ts.getNeededNestedColumnPaths());
        // push down filters
        HiveInputFormat.pushFilters(jobClone, ts, null);
        AcidUtils.setAcidOperationalProperties(jobClone, ts.getConf().isTranscationalTable(), ts.getConf().getAcidOperationalProperties());
        AcidUtils.setValidWriteIdList(jobClone, ts.getConf());
        // create a fetch operator
        FetchOperator fetchOp = new FetchOperator(entry.getValue(), jobClone);
        fetchOpJobConfMap.put(fetchOp, jobClone);
        fetchOperators.put(entry.getKey(), fetchOp);
        l4j.info("fetchoperator for " + entry.getKey() + " created");
    }
    // initialize all forward operators
    for (Map.Entry<String, FetchOperator> entry : fetchOperators.entrySet()) {
        // get the forward op
        String alias = entry.getKey();
        Operator<? extends OperatorDesc> forwardOp = work.getAliasToWork().get(alias);
        // put the exe context into all the operators
        forwardOp.passExecContext(execContext);
        // All the operators need to be initialized before process
        FetchOperator fetchOp = entry.getValue();
        JobConf jobConf = fetchOpJobConfMap.get(fetchOp);
        if (jobConf == null) {
            jobConf = job;
        }
        // initialize the forward operator
        ObjectInspector objectInspector = fetchOp.getOutputObjectInspector();
        forwardOp.initialize(jobConf, new ObjectInspector[] { objectInspector });
        l4j.info("fetchoperator for " + entry.getKey() + " initialized");
    }
}
Also used : FetchOperator(org.apache.hadoop.hive.ql.exec.FetchOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) Map(java.util.Map) HashMap(java.util.HashMap) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) JobConf(org.apache.hadoop.mapred.JobConf)
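
The core pattern above is that each fetch alias gets its own JobConf clone, so the column projections and filters pushed for one table scan never leak into another scan's configuration. Below is a minimal, Hive-independent sketch of that per-alias cloning idea; it relies only on the JobConf copy constructor, and the configuration key used is a hypothetical placeholder rather than one of the keys Hive actually writes.

import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.hadoop.mapred.JobConf;

// Sketch only: per-alias JobConf cloning so per-table settings stay isolated.
public class PerAliasConfSketch {

    public static Map<String, JobConf> cloneConfPerAlias(JobConf base, Map<String, String> aliasToFilter) {
        Map<String, JobConf> result = new LinkedHashMap<>();
        for (Map.Entry<String, String> entry : aliasToFilter.entrySet()) {
            // JobConf(Configuration) copies the base settings into a fresh config
            JobConf clone = new JobConf(base);
            // push a per-alias setting into the clone only ("example.filter.expr" is a made-up key)
            clone.set("example.filter.expr", entry.getValue());
            result.put(entry.getKey(), clone);
        }
        return result;
    }

    public static void main(String[] args) {
        JobConf base = new JobConf();
        Map<String, String> filters = new LinkedHashMap<>();
        filters.put("src", "key > 10");
        filters.put("dst", "value IS NOT NULL");
        Map<String, JobConf> perAlias = cloneConfPerAlias(base, filters);
        System.out.println(perAlias.get("src").get("example.filter.expr")); // key > 10
        System.out.println(base.get("example.filter.expr"));                // null: the base conf is untouched
    }
}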

Example 47 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From the class TestGenTezWork, method setUp:

/**
 * @throws java.lang.Exception
 */
@SuppressWarnings("unchecked")
@Before
public void setUp() throws Exception {
    // Init conf
    final HiveConf conf = new HiveConf(SemanticAnalyzer.class);
    SessionState.start(conf);
    // Init parse context
    final ParseContext pctx = new ParseContext();
    pctx.setContext(new Context(conf));
    ctx = new GenTezProcContext(conf, pctx, Collections.EMPTY_LIST, new ArrayList<Task<? extends Serializable>>(), Collections.EMPTY_SET, Collections.EMPTY_SET);
    proc = new GenTezWork(new GenTezUtils() {

        @Override
        protected void setupMapWork(MapWork mapWork, GenTezProcContext context, PrunedPartitionList partitions, TableScanOperator root, String alias) throws SemanticException {
            LinkedHashMap<String, Operator<? extends OperatorDesc>> map = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
            map.put("foo", root);
            mapWork.setAliasToWork(map);
            return;
        }
    });
    CompilationOpContext cCtx = new CompilationOpContext();
    fs = new FileSinkOperator(cCtx);
    fs.setConf(new FileSinkDesc());
    rs = new ReduceSinkOperator(cCtx);
    rs.setConf(new ReduceSinkDesc());
    TableDesc tableDesc = new TableDesc();
    tableDesc.setProperties(new Properties());
    rs.getConf().setKeySerializeInfo(tableDesc);
    ts = new TableScanOperator(cCtx);
    ts.setConf(new TableScanDesc(null));
    ts.getChildOperators().add(rs);
    rs.getParentOperators().add(ts);
    rs.getChildOperators().add(fs);
    fs.getParentOperators().add(rs);
    ctx.preceedingWork = null;
    ctx.currentRootOperator = ts;
}
Also used : Context(org.apache.hadoop.hive.ql.Context) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc) ArrayList(java.util.ArrayList) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) Properties(java.util.Properties) LinkedHashMap(java.util.LinkedHashMap) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) HiveConf(org.apache.hadoop.hive.conf.HiveConf) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) Before(org.junit.Before)
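
The interesting part of this setUp() is the wiring: Hive operators form a doubly linked DAG, so building the TS -> RS -> FS chain requires updating both the parent's child list and the child's parent list, which the test does by hand. The following self-contained sketch mirrors that symmetric linking with a stand-in Op class (it is not Hive's Operator API):

import java.util.ArrayList;
import java.util.List;

// Sketch only: operators as nodes in a doubly linked DAG.
public class OperatorWiringSketch {

    static class Op {
        final String name;
        final List<Op> parents = new ArrayList<>();
        final List<Op> children = new ArrayList<>();
        Op(String name) { this.name = name; }
    }

    // link both directions, mirroring ts.getChildOperators().add(rs) / rs.getParentOperators().add(ts)
    static void connect(Op parent, Op child) {
        parent.children.add(child);
        child.parents.add(parent);
    }

    public static void main(String[] args) {
        Op ts = new Op("TS");
        Op rs = new Op("RS");
        Op fs = new Op("FS");
        connect(ts, rs);
        connect(rs, fs);
        System.out.println(ts.children.get(0).name); // RS
        System.out.println(fs.parents.get(0).name);  // RS
    }
}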

Example 48 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From the class NullScanTaskDispatcher, method processAlias:

private void processAlias(MapWork work, HashSet<TableScanOperator> tableScans) {
    ArrayList<String> aliases = new ArrayList<String>();
    for (TableScanOperator tso : tableScans) {
        // should not apply this for non-native table
        if (tso.getConf().getTableMetadata().getStorageHandler() != null) {
            continue;
        }
        String alias = getAliasForTableScanOperator(work, tso);
        aliases.add(alias);
        tso.getConf().setIsMetadataOnly(true);
    }
    // group path alias according to work
    LinkedHashMap<Path, ArrayList<String>> candidates = new LinkedHashMap<>();
    for (Path path : work.getPaths()) {
        ArrayList<String> aliasesAffected = work.getPathToAliases().get(path);
        if (aliasesAffected != null && aliasesAffected.size() > 0) {
            candidates.put(path, aliasesAffected);
        }
    }
    for (Entry<Path, ArrayList<String>> entry : candidates.entrySet()) {
        processAlias(work, entry.getKey(), entry.getValue(), aliases);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap)
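
The second half of processAlias() groups input paths by the aliases they affect, skipping paths that map to no alias, and uses a LinkedHashMap so the later per-path processing stays in insertion order. A small sketch of that grouping step, with plain Strings standing in for Hadoop Path objects:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Sketch only: keep each path that maps to at least one alias, in a deterministic order.
public class PathAliasGroupingSketch {

    static Map<String, List<String>> groupCandidates(List<String> paths,
                                                     Map<String, List<String>> pathToAliases) {
        Map<String, List<String>> candidates = new LinkedHashMap<>();
        for (String path : paths) {
            List<String> affected = pathToAliases.get(path);
            if (affected != null && !affected.isEmpty()) {
                candidates.put(path, affected);
            }
        }
        return candidates;
    }

    public static void main(String[] args) {
        Map<String, List<String>> pathToAliases = new LinkedHashMap<>();
        pathToAliases.put("/warehouse/t1", new ArrayList<>(Arrays.asList("a", "b")));
        pathToAliases.put("/warehouse/t2", new ArrayList<>());
        Map<String, List<String>> out =
            groupCandidates(Arrays.asList("/warehouse/t1", "/warehouse/t2", "/warehouse/t3"), pathToAliases);
        System.out.println(out.keySet()); // [/warehouse/t1] -- empty and unmapped paths are dropped
    }
}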

Example 49 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From the class IndexWhereProcessor, method process:

/**
 * Process a node of the operator tree. This matches on the rule in IndexWhereTaskDispatcher.
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
    TableScanOperator operator = (TableScanOperator) nd;
    List<Node> opChildren = operator.getChildren();
    TableScanDesc operatorDesc = operator.getConf();
    if (operatorDesc == null || !tsToIndices.containsKey(operator)) {
        return null;
    }
    List<Index> indexes = tsToIndices.get(operator);
    ExprNodeDesc predicate = operatorDesc.getFilterExpr();
    IndexWhereProcCtx context = (IndexWhereProcCtx) procCtx;
    ParseContext pctx = context.getParseContext();
    LOG.info("Processing predicate for index optimization");
    if (predicate == null) {
        LOG.info("null predicate pushed down");
        return null;
    }
    LOG.info(predicate.getExprString());
    // check if we have indexes on all partitions in this table scan
    Set<Partition> queryPartitions;
    try {
        queryPartitions = IndexUtils.checkPartitionsCoveredByIndex(operator, pctx, indexes);
        if (queryPartitions == null) {
            // partitions not covered
            return null;
        }
    } catch (HiveException e) {
        LOG.error("Fatal Error: problem accessing metastore", e);
        throw new SemanticException(e);
    }
    // we can only process MapReduce tasks to check input size
    if (!context.getCurrentTask().isMapRedTask()) {
        return null;
    }
    MapRedTask currentTask = (MapRedTask) context.getCurrentTask();
    // get potential reentrant index queries from each index
    Map<Index, HiveIndexQueryContext> queryContexts = new HashMap<Index, HiveIndexQueryContext>();
    // make sure we have an index on the table being scanned
    TableDesc tblDesc = operator.getTableDesc();
    Map<String, List<Index>> indexesByType = new HashMap<String, List<Index>>();
    for (Index indexOnTable : indexes) {
        if (indexesByType.get(indexOnTable.getIndexHandlerClass()) == null) {
            List<Index> newType = new ArrayList<Index>();
            newType.add(indexOnTable);
            indexesByType.put(indexOnTable.getIndexHandlerClass(), newType);
        } else {
            indexesByType.get(indexOnTable.getIndexHandlerClass()).add(indexOnTable);
        }
    }
    // choose the index type with the most indexes of the same type on the table
    // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
    List<Index> bestIndexes = indexesByType.values().iterator().next();
    for (List<Index> indexTypes : indexesByType.values()) {
        if (bestIndexes.size() < indexTypes.size()) {
            bestIndexes = indexTypes;
        }
    }
    // rewrite index queries for the chosen index type
    HiveIndexQueryContext tmpQueryContext = new HiveIndexQueryContext();
    tmpQueryContext.setQueryPartitions(queryPartitions);
    rewriteForIndexes(predicate, bestIndexes, pctx, currentTask, tmpQueryContext);
    List<Task<?>> indexTasks = tmpQueryContext.getQueryTasks();
    if (indexTasks != null && indexTasks.size() > 0) {
        queryContexts.put(bestIndexes.get(0), tmpQueryContext);
    }
    // choose an index rewrite to use
    if (queryContexts.size() > 0) {
        // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
        Index chosenIndex = queryContexts.keySet().iterator().next();
        // modify the parse context to use indexing
        // we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
        HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);
        // prepare the map reduce job to use indexing
        MapWork work = currentTask.getWork().getMapWork();
        work.setInputformat(queryContext.getIndexInputFormat());
        work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
        // modify inputs based on index query
        Set<ReadEntity> inputs = pctx.getSemanticInputs();
        inputs.addAll(queryContext.getAdditionalSemanticInputs());
        List<Task<?>> chosenRewrite = queryContext.getQueryTasks();
        // add dependencies so index query runs first
        insertIndexQuery(pctx, context, chosenRewrite);
    }
    return null;
}
Also used : HiveIndexQueryContext(org.apache.hadoop.hive.ql.index.HiveIndexQueryContext) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) HashMap(java.util.HashMap) Node(org.apache.hadoop.hive.ql.lib.Node) ArrayList(java.util.ArrayList) Index(org.apache.hadoop.hive.metastore.api.Index) List(java.util.List) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
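
After collecting the indexes, the processor buckets them by handler class and simply keeps the largest bucket (the TODO in the code notes that a cost-based choice would be better, see HIVE-2130). A generic sketch of that "largest bucket" selection, with Strings standing in for Index objects and the key function standing in for getIndexHandlerClass():

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch only: group items by a key and keep the group with the most members.
public class LargestBucketSketch {

    static <T> List<T> largestBucket(List<T> items, java.util.function.Function<T, String> keyFn) {
        Map<String, List<T>> byKey = new HashMap<>();
        for (T item : items) {
            byKey.computeIfAbsent(keyFn.apply(item), k -> new ArrayList<>()).add(item);
        }
        List<T> best = null;
        for (List<T> bucket : byKey.values()) {
            if (best == null || bucket.size() > best.size()) {
                best = bucket;
            }
        }
        return best;
    }

    public static void main(String[] args) {
        List<String> indexes = Arrays.asList("compact:i1", "bitmap:i2", "compact:i3");
        // key = the handler "type" before the colon; "compact" wins with two entries
        System.out.println(largestBucket(indexes, s -> s.split(":")[0])); // [compact:i1, compact:i3]
    }
}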

Example 50 with TableScanOperator

Use of org.apache.hadoop.hive.ql.exec.TableScanOperator in project hive by apache.

From the class IndexWhereTaskDispatcher, method createOperatorRules:

/**
   * Create a set of rules that only match WHERE predicates on columns we have
   * an index on.
   * @return the rule-to-processor map, or null if none of the scanned tables has an index
   */
private Map<Rule, NodeProcessor> createOperatorRules(ParseContext pctx) throws SemanticException {
    Map<Rule, NodeProcessor> operatorRules = new LinkedHashMap<Rule, NodeProcessor>();
    List<String> supportedIndexes = new ArrayList<String>();
    supportedIndexes.add(CompactIndexHandler.class.getName());
    supportedIndexes.add(BitmapIndexHandler.class.getName());
    // query the metastore to know what columns we have indexed
    Map<TableScanOperator, List<Index>> indexes = new HashMap<TableScanOperator, List<Index>>();
    for (Operator<? extends OperatorDesc> op : pctx.getTopOps().values()) {
        if (op instanceof TableScanOperator) {
            List<Index> tblIndexes = IndexUtils.getIndexes(((TableScanOperator) op).getConf().getTableMetadata(), supportedIndexes);
            if (tblIndexes.size() > 0) {
                indexes.put((TableScanOperator) op, tblIndexes);
            }
        }
    }
    // quit if our tables don't have any indexes
    if (indexes.size() == 0) {
        return null;
    }
    // We set the pushed predicate from the WHERE clause as the filter expr on
    // all table scan operators, so we look for table scan operators(TS%)
    operatorRules.put(new RuleRegExp("RULEWhere", TableScanOperator.getOperatorName() + "%"), new IndexWhereProcessor(indexes));
    return operatorRules;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) NodeProcessor(org.apache.hadoop.hive.ql.lib.NodeProcessor) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) RuleRegExp(org.apache.hadoop.hive.ql.lib.RuleRegExp) CompactIndexHandler(org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler) Index(org.apache.hadoop.hive.metastore.api.Index) BitmapIndexHandler(org.apache.hadoop.hive.ql.index.bitmap.BitmapIndexHandler) List(java.util.List) Rule(org.apache.hadoop.hive.ql.lib.Rule)
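
createOperatorRules() walks the plan's top operators, keeps only the TableScanOperators that actually have indexes, and registers a single processor under a rule keyed by the operator name pattern (TS%). The sketch below reproduces that shape without any Hive types; the Op and TableScan classes and the rule map are illustrative stand-ins, not Hive's Rule/NodeProcessor API:

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Sketch only: filter operators by type, then register a name-pattern rule if any qualify.
public class OperatorRuleSketch {

    static class Op { }
    static class TableScan extends Op {
        static String operatorName() { return "TS"; }
    }

    static Map<String, String> buildRules(Map<String, Op> topOps) {
        List<TableScan> scans = new ArrayList<>();
        for (Op op : topOps.values()) {
            if (op instanceof TableScan) {          // same instanceof filter as the original
                scans.add((TableScan) op);
            }
        }
        if (scans.isEmpty()) {
            return null;                            // nothing to optimize, mirroring the early return
        }
        Map<String, String> rules = new LinkedHashMap<>();
        rules.put(TableScan.operatorName() + "%", "IndexWhereProcessor"); // regex-style rule key
        return rules;
    }

    public static void main(String[] args) {
        Map<String, Op> topOps = new LinkedHashMap<>();
        topOps.put("t1", new TableScan());
        topOps.put("t2", new Op());
        System.out.println(buildRules(topOps)); // {TS%=IndexWhereProcessor}
    }
}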

Aggregations

TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 88 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 35 usages
ArrayList (java.util.ArrayList): 33 usages
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 28 usages
Table (org.apache.hadoop.hive.ql.metadata.Table): 21 usages
HashMap (java.util.HashMap): 20 usages
Path (org.apache.hadoop.fs.Path): 20 usages
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 20 usages
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 19 usages
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 19 usages
LinkedHashMap (java.util.LinkedHashMap): 18 usages
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 18 usages
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 18 usages
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 15 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 15 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 15 usages
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 14 usages
Map (java.util.Map): 13 usages
AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator): 12 usages
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 12 usages