
Example 21 with ParseContext

Use of org.apache.hadoop.hive.ql.parse.ParseContext in project Hive by Apache.

From the class SparkCompiler, method generateTaskTree.

/**
 * TODO: need to turn on the rules that are commented out and add more if necessary.
 */
@Override
protected void generateTaskTree(List<Task<?>> rootTasks, ParseContext pCtx, List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs, Set<WriteEntity> outputs) throws SemanticException {
    PERF_LOGGER.perfLogBegin(CLASS_NAME, PerfLogger.SPARK_GENERATE_TASK_TREE);
    GenSparkUtils utils = GenSparkUtils.getUtils();
    utils.resetSequenceNumber();
    ParseContext tempParseContext = getParseContext(pCtx, rootTasks);
    GenSparkProcContext procCtx = new GenSparkProcContext(conf, tempParseContext, mvTask, rootTasks, inputs, outputs, pCtx.getTopOps());
    // -------------------------------- First Pass ---------------------------------- //
    // Identify SparkPartitionPruningSinkOperators, and break OP tree if necessary
    Map<SemanticRule, SemanticNodeProcessor> opRules = new LinkedHashMap<SemanticRule, SemanticNodeProcessor>();
    opRules.put(new RuleRegExp("Clone OP tree for PartitionPruningSink", SparkPartitionPruningSinkOperator.getOperatorName() + "%"), new SplitOpTreeForDPP());
    SemanticDispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
    SemanticGraphWalker ogw = new GenSparkWorkWalker(disp, procCtx);
    List<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pCtx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    // -------------------------------- Second Pass ---------------------------------- //
    // Process operator tree in two steps: first we process the extra op trees generated
    // in the first pass. Then we process the main op tree, and the result task will depend
    // on the task generated in the first pass.
    topNodes.clear();
    topNodes.addAll(procCtx.topOps.values());
    generateTaskTreeHelper(procCtx, topNodes);
    // If this set is not empty, we need a separate task for collecting
    // the partitions used.
    if (!procCtx.clonedPruningTableScanSet.isEmpty()) {
        SparkTask pruningTask = SparkUtilities.createSparkTask(conf);
        SparkTask mainTask = procCtx.currentTask;
        pruningTask.addDependentTask(procCtx.currentTask);
        procCtx.rootTasks.remove(procCtx.currentTask);
        procCtx.rootTasks.add(pruningTask);
        procCtx.currentTask = pruningTask;
        topNodes.clear();
        topNodes.addAll(procCtx.clonedPruningTableScanSet);
        generateTaskTreeHelper(procCtx, topNodes);
        procCtx.currentTask = mainTask;
    }
    // we still need to clone some operator plans and remove union operators
    for (BaseWork w : procCtx.workWithUnionOperators) {
        GenSparkUtils.getUtils().removeUnionOperators(procCtx, w);
    }
    // we need to fill MapWork with 'local' work and bucket information for SMB Join.
    GenSparkUtils.getUtils().annotateMapWork(procCtx);
    // finally make sure the file sink operators are set up right
    for (FileSinkOperator fileSink : procCtx.fileSinkSet) {
        GenSparkUtils.getUtils().processFileSink(procCtx, fileSink);
    }
    // Process partition pruning sinks
    for (Operator<?> prunerSink : procCtx.pruningSinkSet) {
        utils.processPartitionPruningSink(procCtx, (SparkPartitionPruningSinkOperator) prunerSink);
    }
    PERF_LOGGER.perfLogEnd(CLASS_NAME, PerfLogger.SPARK_GENERATE_TASK_TREE);
}
Also used : SemanticRule(org.apache.hadoop.hive.ql.lib.SemanticRule) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) Node(org.apache.hadoop.hive.ql.lib.Node) RuleRegExp(org.apache.hadoop.hive.ql.lib.RuleRegExp) ArrayList(java.util.ArrayList) SemanticGraphWalker(org.apache.hadoop.hive.ql.lib.SemanticGraphWalker) LinkedHashMap(java.util.LinkedHashMap) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) SemanticDispatcher(org.apache.hadoop.hive.ql.lib.SemanticDispatcher) SemanticNodeProcessor(org.apache.hadoop.hive.ql.lib.SemanticNodeProcessor) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
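
The first pass above is an instance of Hive's rule/dispatcher/walker idiom: operator names are matched by a RuleRegExp, matches are routed to a SemanticNodeProcessor through a DefaultRuleDispatcher, and a graph walker drives the traversal from the ParseContext's top operators. The sketch below isolates that idiom. It assumes a Hive version that uses the renamed Semantic* interfaces shown in this example (older releases use the pre-rename names seen in Example 25); TopOpsWalkSketch and CollectingProcessor are illustrative names, not Hive classes.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.lib.SemanticDispatcher;
import org.apache.hadoop.hive.ql.lib.SemanticGraphWalker;
import org.apache.hadoop.hive.ql.lib.SemanticNodeProcessor;
import org.apache.hadoop.hive.ql.lib.SemanticRule;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;

public class TopOpsWalkSketch {

    /** Illustrative processor: records every node its rule fires on. */
    static class CollectingProcessor implements SemanticNodeProcessor {
        final List<Node> matched = new ArrayList<>();

        @Override
        public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
                Object... nodeOutputs) throws SemanticException {
            matched.add(nd);
            return null;
        }
    }

    /** Walk the operator tree below the ParseContext's top operators,
     *  firing the processor on every TableScanOperator. */
    static List<Node> collectTableScans(ParseContext pCtx, NodeProcessorCtx procCtx)
            throws SemanticException {
        CollectingProcessor collector = new CollectingProcessor();

        Map<SemanticRule, SemanticNodeProcessor> opRules = new LinkedHashMap<>();
        opRules.put(new RuleRegExp("Match table scans",
                TableScanOperator.getOperatorName() + "%"), collector);

        // Null default processor: only explicit rule matches are dispatched.
        SemanticDispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
        SemanticGraphWalker walker = new DefaultGraphWalker(disp);

        List<Node> topNodes = new ArrayList<Node>(pCtx.getTopOps().values());
        walker.startWalking(topNodes, null);
        return collector.matched;
    }
}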

Example 22 with ParseContext

Use of org.apache.hadoop.hive.ql.parse.ParseContext in project Hive by Apache.

From the class SparkProcessAnalyzeTable, method process.

@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    GenSparkProcContext context = (GenSparkProcContext) procContext;
    TableScanOperator tableScan = (TableScanOperator) nd;
    ParseContext parseContext = context.parseContext;
    Table table = tableScan.getConf().getTableMetadata();
    @SuppressWarnings("rawtypes") Class<? extends InputFormat> inputFormat = table.getInputFormatClass();
    if (parseContext.getQueryProperties().isAnalyzeCommand()) {
        Preconditions.checkArgument(tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0, "AssertionError: expected tableScan.getChildOperators() to be null, " + "or tableScan.getChildOperators().size() to be 0");
        String alias = null;
        for (String a : parseContext.getTopOps().keySet()) {
            if (tableScan == parseContext.getTopOps().get(a)) {
                alias = a;
            }
        }
        Preconditions.checkArgument(alias != null, "AssertionError: expected alias to be not null");
        SparkWork sparkWork = context.currentTask.getWork();
        if (BasicStatsNoJobTask.canUseBasicStats(table, inputFormat)) {
            // For ORC, Parquet and Iceberg tables, all the following statements are the same
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
            // There will not be any Spark job above this task
            StatsWork statWork = new StatsWork(table, parseContext.getConf());
            statWork.setFooterScan();
            // If partition is specified, get pruned partition list
            Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
            if (confirmedParts.size() > 0) {
                List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
                PrunedPartitionList partList = new PrunedPartitionList(table, confirmedParts, partCols, false);
                statWork.addInputPartitions(partList.getPartitions());
            }
            Task<StatsWork> snjTask = TaskFactory.get(statWork);
            snjTask.setParentTasks(null);
            context.rootTasks.remove(context.currentTask);
            context.rootTasks.add(snjTask);
            return true;
        } else {
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
            // The plan consists of a simple SparkTask followed by a StatsTask.
            // The Spark task is just a simple TableScanOperator
            BasicStatsWork basicStatsWork = new BasicStatsWork(table.getTableSpec());
            basicStatsWork.setIsExplicitAnalyze(true);
            basicStatsWork.setNoScanAnalyzeCommand(parseContext.getQueryProperties().isNoScanAnalyzeCommand());
            StatsWork columnStatsWork = new StatsWork(table, basicStatsWork, parseContext.getConf());
            columnStatsWork.collectStatsFromAggregator(tableScan.getConf());
            columnStatsWork.setSourceTask(context.currentTask);
            Task<StatsWork> statsTask = TaskFactory.get(columnStatsWork);
            context.currentTask.addDependentTask(statsTask);
            // The plan consists of a StatsTask only.
            if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
                statsTask.setParentTasks(null);
                context.rootTasks.remove(context.currentTask);
                context.rootTasks.add(statsTask);
            }
            // NOTE: here we should use the new partition predicate pushdown API to get a list of pruned list,
            // and pass it to setTaskPlan as the last parameter
            Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
            PrunedPartitionList partitions = null;
            if (confirmedPartns.size() > 0) {
                List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
                partitions = new PrunedPartitionList(table, confirmedPartns, partCols, false);
            }
            MapWork w = utils.createMapWork(context, tableScan, sparkWork, partitions);
            w.setGatheringStats(true);
            return true;
        }
    }
    return null;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) BasicStatsWork(org.apache.hadoop.hive.ql.plan.BasicStatsWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext)
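
Which branch runs above is decided by BasicStatsNoJobTask.canUseBasicStats: when statistics can be read straight from file footers (ORC, Parquet, Iceberg), no Spark job is needed and a footer-scan StatsWork replaces the current task as a root. The helper below condenses that branch using only the calls shown in the example; the class name, method name, and parameter list are illustrative, not part of Hive.

import java.util.List;
import java.util.Set;

import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.StatsWork;

public class FooterScanStatsSketch {

    /** Replace the current (Spark) root task with a footer-scan stats task,
     *  condensing the canUseBasicStats branch shown above. */
    static Task<StatsWork> rootFooterScanStats(Table table, TableScanOperator tableScan,
            ParseContext parseContext, List<Task<?>> rootTasks, Task<?> currentTask)
            throws SemanticException {
        StatsWork statWork = new StatsWork(table, parseContext.getConf());
        // Statistics are read from ORC/Parquet/Iceberg file footers; no scan job runs.
        statWork.setFooterScan();

        // If the ANALYZE statement named a partition spec, restrict the stats work to it.
        Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
        if (!confirmedParts.isEmpty()) {
            List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
            PrunedPartitionList partList =
                    new PrunedPartitionList(table, confirmedParts, partCols, false);
            statWork.addInputPartitions(partList.getPartitions());
        }

        // The stats task becomes a new root; the now-unneeded scan task is dropped.
        Task<StatsWork> statsTask = TaskFactory.get(statWork);
        statsTask.setParentTasks(null);
        rootTasks.remove(currentTask);
        rootTasks.add(statsTask);
        return statsTask;
    }
}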

Example 23 with ParseContext

Use of org.apache.hadoop.hive.ql.parse.ParseContext in project Hive by Apache.

From the class TestSparkUtilities, method testCreateMoveTaskDoesntCreateCascadeTempDirs.

@Test
public void testCreateMoveTaskDoesntCreateCascadeTempDirs() throws Exception {
    FileSinkOperator fsOp = mock(FileSinkOperator.class);
    ParseContext pctx = mock(ParseContext.class);
    Configuration conf = new Configuration();
    conf.set("_hive.hdfs.session.path", "hdfs:/dummypath");
    conf.set("_hive.local.session.path", "hdfs:/dummypath");
    Context ctx = new Context(conf);
    String executionId = ctx.getExecutionId();
    Context ctxSpy = spy(ctx);
    FileSinkDesc fileSinkDesc = mock(FileSinkDesc.class);
    Path mrPath = new Path("hdfs:/tmp/.staging/" + executionId + "/-mr-10001");
    Path mrPath2 = new Path("hdfs:/tmp/.staging/" + executionId + "/-mr-10002");
    Path extPath = new Path("hdfs:/tmp/.staging/" + executionId + "/-ext-10001");
    Path extPath2 = new Path("hdfs:/tmp/.staging/" + executionId + "/-ext-10002");
    final Ref<Path> expectedPathRef = new Ref<>(mrPath);
    final Ref<Path> testPathRef = new Ref<>(extPath);
    doAnswer(invocationOnMock -> {
        return ctxSpy;
    }).when(pctx).getContext();
    doAnswer(invocationOnMock -> {
        return mrPath2;
    }).when(ctxSpy).getMRTmpPath();
    doAnswer(invocationOnMock -> {
        return extPath2;
    }).when(ctxSpy).getExternalTmpPath(any(Path.class));
    doAnswer(invocationOnMock -> {
        return testPathRef.value;
    }).when(fileSinkDesc).getFinalDirName();
    doAnswer(invocationOnMock -> {
        return null;
    }).when(fileSinkDesc).getLinkedFileSinkDesc();
    doAnswer(invocationOnMock -> {
        return fileSinkDesc;
    }).when(fsOp).getConf();
    doAnswer(invocationOnMock -> {
        assertEquals(expectedPathRef.value, invocationOnMock.getArgument(0, Path.class));
        return null;
    }).when(fileSinkDesc).setDirName(any(Path.class));
    testPathRef.value = mrPath;
    expectedPathRef.value = mrPath2;
    GenSparkUtils.createMoveTask(null, true, fsOp, pctx, null, null, null);
    testPathRef.value = extPath;
    expectedPathRef.value = extPath2;
    GenSparkUtils.createMoveTask(null, true, fsOp, pctx, null, null, null);
}
Also used : Context(org.apache.hadoop.hive.ql.Context) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Path(org.apache.hadoop.fs.Path) Ref(org.apache.hive.common.util.Ref) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) Configuration(org.apache.hadoop.conf.Configuration) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc) Test(org.junit.Test)
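
The test stubs both plain mocks and a spied Context with Mockito's doAnswer(...).when(...) form. On a spy, this form registers the stub without invoking the real method (which when(spy.method()) would do), and the Answer lambda gets access to the call's arguments, as the assertion on setDirName shows. Below is a minimal, self-contained sketch of the idiom, assuming Mockito 2+ and JUnit 4 as in the test above; the Greeter and SpyStubbingSketchTest names are purely illustrative.

import static org.junit.Assert.assertEquals;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.spy;

import org.junit.Test;

public class SpyStubbingSketchTest {

    /** Purely illustrative class; not part of Hive. */
    public static class Greeter {
        public String greet(String name) {
            return "Hello, " + name;
        }
    }

    @Test
    public void stubsSpyWithoutInvokingRealMethod() {
        Greeter greeterSpy = spy(new Greeter());

        // doAnswer(...).when(spy).method(...) registers the stub without calling the
        // real greet(); the Answer can also inspect arguments, as the Hive test does
        // with invocationOnMock.getArgument(0, Path.class).
        doAnswer(invocation -> "Hi, " + invocation.getArgument(0, String.class))
                .when(greeterSpy).greet(any(String.class));

        assertEquals("Hi, dev", greeterSpy.greet("dev"));
    }
}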

Example 24 with ParseContext

Use of org.apache.hadoop.hive.ql.parse.ParseContext in project Hive by Apache.

From the class IndexWhereProcessor, method process.

/**
 * Process a node of the operator tree. This matches on the rule in IndexWhereTaskDispatcher.
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
    TableScanOperator operator = (TableScanOperator) nd;
    List<Node> opChildren = operator.getChildren();
    TableScanDesc operatorDesc = operator.getConf();
    if (operatorDesc == null || !tsToIndices.containsKey(operator)) {
        return null;
    }
    List<Index> indexes = tsToIndices.get(operator);
    ExprNodeDesc predicate = operatorDesc.getFilterExpr();
    IndexWhereProcCtx context = (IndexWhereProcCtx) procCtx;
    ParseContext pctx = context.getParseContext();
    LOG.info("Processing predicate for index optimization");
    if (predicate == null) {
        LOG.info("null predicate pushed down");
        return null;
    }
    LOG.info(predicate.getExprString());
    // check if we have tsToIndices on all partitions in this table scan
    Set<Partition> queryPartitions;
    try {
        queryPartitions = IndexUtils.checkPartitionsCoveredByIndex(operator, pctx, indexes);
        if (queryPartitions == null) {
            // partitions not covered
            return null;
        }
    } catch (HiveException e) {
        LOG.error("Fatal Error: problem accessing metastore", e);
        throw new SemanticException(e);
    }
    // we can only process MapReduce tasks to check input size
    if (!context.getCurrentTask().isMapRedTask()) {
        return null;
    }
    MapRedTask currentTask = (MapRedTask) context.getCurrentTask();
    // get potential reentrant index queries from each index
    Map<Index, HiveIndexQueryContext> queryContexts = new HashMap<Index, HiveIndexQueryContext>();
    // make sure we have an index on the table being scanned
    TableDesc tblDesc = operator.getTableDesc();
    Map<String, List<Index>> indexesByType = new HashMap<String, List<Index>>();
    for (Index indexOnTable : indexes) {
        if (indexesByType.get(indexOnTable.getIndexHandlerClass()) == null) {
            List<Index> newType = new ArrayList<Index>();
            newType.add(indexOnTable);
            indexesByType.put(indexOnTable.getIndexHandlerClass(), newType);
        } else {
            indexesByType.get(indexOnTable.getIndexHandlerClass()).add(indexOnTable);
        }
    }
    // choose index type with most tsToIndices of the same type on the table
    // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
    List<Index> bestIndexes = indexesByType.values().iterator().next();
    for (List<Index> indexTypes : indexesByType.values()) {
        if (bestIndexes.size() < indexTypes.size()) {
            bestIndexes = indexTypes;
        }
    }
    // rewrite index queries for the chosen index type
    HiveIndexQueryContext tmpQueryContext = new HiveIndexQueryContext();
    tmpQueryContext.setQueryPartitions(queryPartitions);
    rewriteForIndexes(predicate, bestIndexes, pctx, currentTask, tmpQueryContext);
    List<Task<?>> indexTasks = tmpQueryContext.getQueryTasks();
    if (indexTasks != null && indexTasks.size() > 0) {
        queryContexts.put(bestIndexes.get(0), tmpQueryContext);
    }
    // choose an index rewrite to use
    if (queryContexts.size() > 0) {
        // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
        Index chosenIndex = queryContexts.keySet().iterator().next();
        // modify the parse context to use indexing
        // we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
        HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);
        // prepare the map reduce job to use indexing
        MapWork work = currentTask.getWork().getMapWork();
        work.setInputformat(queryContext.getIndexInputFormat());
        work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
        // modify inputs based on index query
        Set<ReadEntity> inputs = pctx.getSemanticInputs();
        inputs.addAll(queryContext.getAdditionalSemanticInputs());
        List<Task<?>> chosenRewrite = queryContext.getQueryTasks();
        // add dependencies so index query runs first
        insertIndexQuery(pctx, context, chosenRewrite);
    }
    return null;
}
Also used : HiveIndexQueryContext(org.apache.hadoop.hive.ql.index.HiveIndexQueryContext) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) HashMap(java.util.HashMap) Node(org.apache.hadoop.hive.ql.lib.Node) ArrayList(java.util.ArrayList) Index(org.apache.hadoop.hive.metastore.api.Index) List(java.util.List) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanDesc(org.apache.hadoop.hive.ql.plan.TableScanDesc) ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
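
The middle of the method is a plain group-and-pick step: indexes are bucketed by their handler class, and the largest bucket wins (the TODO notes a cost-based choice would be better). The same logic in isolation, as a generic JDK-only sketch with illustrative names:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

public class LargestGroupSketch {

    /** Group items by a key and return the largest group; mirrors how the
     *  processor above picks the index type with the most indexes on the table. */
    static <T, K> List<T> largestGroup(List<T> items, Function<T, K> keyFn) {
        Map<K, List<T>> groups = new HashMap<>();
        for (T item : items) {
            // computeIfAbsent replaces the explicit "create the list if missing"
            // branch in the original loop over indexes.
            groups.computeIfAbsent(keyFn.apply(item), k -> new ArrayList<>()).add(item);
        }
        List<T> best = null;
        for (List<T> group : groups.values()) {
            if (best == null || best.size() < group.size()) {
                best = group;
            }
        }
        // Returns null for an empty input, where the original code would have
        // thrown on indexesByType.values().iterator().next().
        return best;
    }
}

With the Hive types, largestGroup(indexes, Index::getIndexHandlerClass) would reproduce the bestIndexes selection above.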

Example 25 with ParseContext

Use of org.apache.hadoop.hive.ql.parse.ParseContext in project Hive by Apache.

From the class IndexWhereTaskDispatcher, method dispatch.

@Override
public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs) throws SemanticException {
    Task<? extends Serializable> task = (Task<? extends Serializable>) nd;
    ParseContext pctx = physicalContext.getParseContext();
    // create the regex's so the walker can recognize our WHERE queries
    Map<Rule, NodeProcessor> operatorRules = createOperatorRules(pctx);
    // check for no indexes on any table
    if (operatorRules == null) {
        return null;
    }
    // create context so the walker can carry the current task with it.
    IndexWhereProcCtx indexWhereOptimizeCtx = new IndexWhereProcCtx(task, pctx);
    // create the dispatcher, which fires the processor according to the rule that
    // best matches
    Dispatcher dispatcher = new DefaultRuleDispatcher(getDefaultProcessor(), operatorRules, indexWhereOptimizeCtx);
    // walk the mapper operator(not task) tree for each specific task
    GraphWalker ogw = new DefaultGraphWalker(dispatcher);
    ArrayList<Node> topNodes = new ArrayList<Node>();
    if (task.getWork() instanceof MapredWork) {
        topNodes.addAll(((MapredWork) task.getWork()).getMapWork().getAliasToWork().values());
    } else {
        return null;
    }
    ogw.startWalking(topNodes, null);
    return null;
}
Also used : Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) NodeProcessor(org.apache.hadoop.hive.ql.lib.NodeProcessor) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) DefaultGraphWalker(org.apache.hadoop.hive.ql.lib.DefaultGraphWalker) Node(org.apache.hadoop.hive.ql.lib.Node) ArrayList(java.util.ArrayList) Dispatcher(org.apache.hadoop.hive.ql.lib.Dispatcher) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Rule(org.apache.hadoop.hive.ql.lib.Rule) GraphWalker(org.apache.hadoop.hive.ql.lib.GraphWalker)
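
The dispatcher only descends into MapReduce tasks: anything whose work is not a MapredWork is skipped, and otherwise the walk starts from the map-side root operators in aliasToWork. A small sketch of just that guard, using the same pre-rename Task signature as the example; the class and method names are illustrative:

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.plan.MapredWork;

public class MapSideTopNodesSketch {

    /** Collect the root operators of a task's map-side plan, or an empty list when
     *  the task is not a MapReduce task (mirrors the instanceof guard above). */
    static List<Node> mapSideTopNodes(Task<? extends Serializable> task) {
        List<Node> topNodes = new ArrayList<Node>();
        if (task.getWork() instanceof MapredWork) {
            topNodes.addAll(((MapredWork) task.getWork()).getMapWork().getAliasToWork().values());
        }
        return topNodes;
    }
}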

Aggregations

ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext): 35
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 15
ArrayList (java.util.ArrayList): 14
Path (org.apache.hadoop.fs.Path): 12
Context (org.apache.hadoop.hive.ql.Context): 12
Node (org.apache.hadoop.hive.ql.lib.Node): 10
Operator (org.apache.hadoop.hive.ql.exec.Operator): 8
LinkedHashMap (java.util.LinkedHashMap): 7
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 7
Task (org.apache.hadoop.hive.ql.exec.Task): 7
DefaultRuleDispatcher (org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher): 7
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 7
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 7
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 7
SemanticDispatcher (org.apache.hadoop.hive.ql.lib.SemanticDispatcher): 6
SemanticGraphWalker (org.apache.hadoop.hive.ql.lib.SemanticGraphWalker): 6
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 6
Table (org.apache.hadoop.hive.ql.metadata.Table): 6
PrunedPartitionList (org.apache.hadoop.hive.ql.parse.PrunedPartitionList): 6
FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc): 6