
Example 11 with ParseContext

Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.

The splitTask method of the class SparkSkewJoinProcFactory.

/**
 * If the join is not in a leaf ReduceWork, the spark task has to be split into 2 tasks.
 */
private static void splitTask(SparkTask currentTask, ReduceWork reduceWork, ParseContext parseContext) throws SemanticException {
    SparkWork currentWork = currentTask.getWork();
    Set<Operator<?>> reduceSinkSet = OperatorUtils.getOp(reduceWork, ReduceSinkOperator.class);
    if (currentWork.getChildren(reduceWork).size() == 1 && canSplit(currentWork) && reduceSinkSet.size() == 1) {
        ReduceSinkOperator reduceSink = (ReduceSinkOperator) reduceSinkSet.iterator().next();
        BaseWork childWork = currentWork.getChildren(reduceWork).get(0);
        SparkEdgeProperty originEdge = currentWork.getEdgeProperty(reduceWork, childWork);
        // disconnect the reduce work from its child. this should produce two isolated sub graphs
        currentWork.disconnect(reduceWork, childWork);
        // move works following the current reduce work into a new spark work
        SparkWork newWork = new SparkWork(parseContext.getConf().getVar(HiveConf.ConfVars.HIVEQUERYID));
        newWork.add(childWork);
        copyWorkGraph(currentWork, newWork, childWork);
        // remove them from current spark work
        for (BaseWork baseWork : newWork.getAllWorkUnsorted()) {
            currentWork.remove(baseWork);
            currentWork.getCloneToWork().remove(baseWork);
        }
        // create TS to read intermediate data
        Context baseCtx = parseContext.getContext();
        Path taskTmpDir = baseCtx.getMRTmpPath();
        Operator<? extends OperatorDesc> rsParent = reduceSink.getParentOperators().get(0);
        TableDesc tableDesc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(rsParent.getSchema(), "temporarycol"));
        // this will insert FS and TS between the RS and its parent
        TableScanOperator tableScanOp = GenMapRedUtils.createTemporaryFile(rsParent, reduceSink, taskTmpDir, tableDesc, parseContext);
        // create new MapWork
        MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();
        mapWork.setName("Map " + GenSparkUtils.getUtils().getNextSeqNumber());
        newWork.add(mapWork);
        newWork.connect(mapWork, childWork, originEdge);
        // setup the new map work
        String streamDesc = taskTmpDir.toUri().toString();
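        // if the child reduce work needs tagged inputs (e.g. its reducer is a join), name the stream after the join operator's id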
        if (GenMapRedUtils.needsTagging((ReduceWork) childWork)) {
            Operator<? extends OperatorDesc> childReducer = ((ReduceWork) childWork).getReducer();
            String id = null;
            if (childReducer instanceof JoinOperator) {
                if (parseContext.getJoinOps().contains(childReducer)) {
                    id = ((JoinOperator) childReducer).getConf().getId();
                }
            } else if (childReducer instanceof MapJoinOperator) {
                if (parseContext.getMapJoinOps().contains(childReducer)) {
                    id = ((MapJoinOperator) childReducer).getConf().getId();
                }
            } else if (childReducer instanceof SMBMapJoinOperator) {
                if (parseContext.getSmbMapJoinOps().contains(childReducer)) {
                    id = ((SMBMapJoinOperator) childReducer).getConf().getId();
                }
            }
            if (id != null) {
                streamDesc = id + ":$INTNAME";
            } else {
                streamDesc = "$INTNAME";
            }
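            // make the stream alias unique within the map work in case the name is already taken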
            String origStreamDesc = streamDesc;
            int pos = 0;
            while (mapWork.getAliasToWork().get(streamDesc) != null) {
                streamDesc = origStreamDesc.concat(String.valueOf(++pos));
            }
        }
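        // register the temporary path and table scan with the new map work so it reads the intermediate data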
        GenMapRedUtils.setTaskPlan(taskTmpDir, streamDesc, tableScanOp, mapWork, false, tableDesc);
        // insert the new task between current task and its child
        @SuppressWarnings("unchecked") Task<?> newTask = TaskFactory.get(newWork);
        List<Task<?>> childTasks = currentTask.getChildTasks();
        // must have at most one child
        if (childTasks != null && childTasks.size() > 0) {
            Task<?> childTask = childTasks.get(0);
            currentTask.removeDependentTask(childTask);
            newTask.addDependentTask(childTask);
        }
        currentTask.addDependentTask(newTask);
        newTask.setFetchSource(currentTask.isFetchSource());
    }
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) CommonJoinOperator(org.apache.hadoop.hive.ql.exec.CommonJoinOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Context(org.apache.hadoop.hive.ql.Context) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Path(org.apache.hadoop.fs.Path) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) Task(org.apache.hadoop.hive.ql.exec.Task) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)

Example 12 with ParseContext

Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.

The processFileSink method of the class GenSparkUtils.

public void processFileSink(GenSparkProcContext context, FileSinkOperator fileSink) throws SemanticException {
    ParseContext parseContext = context.parseContext;
    // is INSERT OVERWRITE TABLE
    boolean isInsertTable = GenMapRedUtils.isInsertInto(parseContext, fileSink);
    HiveConf hconf = parseContext.getConf();
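    // decide whether a file-merge stage is needed for this sink's output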
    boolean chDir = GenMapRedUtils.isMergeRequired(context.moveTask, hconf, fileSink, context.currentTask, isInsertTable);
    // Set stats config for FileSinkOperators which are cloned from the fileSink
    List<FileSinkOperator> fileSinkList = context.fileSinkMap.get(fileSink);
    if (fileSinkList != null) {
        for (FileSinkOperator fsOp : fileSinkList) {
            fsOp.getConf().setGatherStats(fileSink.getConf().isGatherStats());
            fsOp.getConf().setStatsReliable(fileSink.getConf().isStatsReliable());
        }
    }
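    // create the move task that places the FileSinkOperator's output in its final location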
    Path finalName = createMoveTask(context.currentTask, chDir, fileSink, parseContext, context.moveTask, hconf, context.dependencyTask);
    if (chDir) {
        // Merge the files in the destination table/partitions by creating Map-only merge job
        // If underlying data is RCFile a RCFileBlockMerge task would be created.
        LOG.info("using CombineHiveInputformat for the merge job");
        GenMapRedUtils.createMRWorkForMergingFiles(fileSink, finalName, context.dependencyTask, context.moveTask, hconf, context.currentTask, parseContext.getQueryState().getLineageState());
    }
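    // if this is a leaf task and the query's fetch reads from this file sink, mark the task as the fetch source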
    FetchTask fetchTask = parseContext.getFetchTask();
    if (fetchTask != null && context.currentTask.getNumChild() == 0) {
        if (fetchTask.isFetchFrom(fileSink.getConf())) {
            context.currentTask.setFetchSource(true);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) HiveConf(org.apache.hadoop.hive.conf.HiveConf) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask)

Example 13 with ParseContext

Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.

The runDynPartitionSortOptimizations method of the class SparkCompiler.

private void runDynPartitionSortOptimizations(OptimizeSparkProcContext procCtx) throws SemanticException {
    // run Sorted dynamic partition optimization
    HiveConf hConf = procCtx.getConf();
    ParseContext parseContext = procCtx.getParseContext();
    runDynPartitionSortOptimizations(parseContext, hConf);
}
Also used : ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) HiveConf(org.apache.hadoop.hive.conf.HiveConf)

Example 14 with ParseContext

Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.

The getTablePartitionUsedColumns method of the class CommandAuthorizerV1.

private static void getTablePartitionUsedColumns(HiveOperation op, BaseSemanticAnalyzer sem, Map<Table, List<String>> tab2Cols, Map<Partition, List<String>> part2Cols, Map<String, Boolean> tableUsePartLevelAuth) throws HiveException {
    // table to columns mapping (tab2Cols)
    if (op.equals(HiveOperation.CREATETABLE_AS_SELECT) || op.equals(HiveOperation.QUERY)) {
        ParseContext parseCtx = sem.getParseContext();
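        // walk every top-level table scan of the query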
        for (Map.Entry<String, TableScanOperator> topOpMap : parseCtx.getTopOps().entrySet()) {
            TableScanOperator tableScanOp = topOpMap.getValue();
            if (!tableScanOp.isInsideView()) {
                Table tbl = tableScanOp.getConf().getTableMetadata();
                List<String> cols = new ArrayList<String>();
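                // collect only the columns this table scan actually reads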
                for (int id : tableScanOp.getNeededColumnIDs()) {
                    cols.add(tbl.getCols().get(id).getName());
                }
                // if it's null then the partition probably doesn't exist so let's use table permission
                if (tbl.isPartitioned() && Boolean.TRUE.equals(tableUsePartLevelAuth.get(tbl.getTableName()))) {
                    String aliasId = topOpMap.getKey();
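                    // prune to the partitions actually referenced so authorization can be checked per partition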
                    PrunedPartitionList partsList = PartitionPruner.prune(tableScanOp, parseCtx, aliasId);
                    Set<Partition> parts = partsList.getPartitions();
                    for (Partition part : parts) {
                        List<String> existingCols = part2Cols.get(part);
                        if (existingCols == null) {
                            existingCols = new ArrayList<String>();
                        }
                        existingCols.addAll(cols);
                        part2Cols.put(part, existingCols);
                    }
                } else {
                    List<String> existingCols = tab2Cols.get(tbl);
                    if (existingCols == null) {
                        existingCols = new ArrayList<String>();
                    }
                    existingCols.addAll(cols);
                    tab2Cols.put(tbl, existingCols);
                }
            }
        }
    }
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) ArrayList(java.util.ArrayList) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) HashMap(java.util.HashMap) Map(java.util.Map)

Example 15 with ParseContext

Use of org.apache.hadoop.hive.ql.parse.ParseContext in project hive by apache.

The explainToString method of the class TestExplainTask.

private <K, V> String explainToString(Map<K, V> explainMap) throws Exception {
    ExplainWork work = new ExplainWork();
    ParseContext pCtx = new ParseContext();
    HashMap<String, TableScanOperator> topOps = new HashMap<>();
    TableScanOperator scanOp = new DummyOperator(new DummyExplainDesc<K, V>(explainMap));
    topOps.put("sample", scanOp);
    pCtx.setTopOps(topOps);
    work.setParseContext(pCtx);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    work.setConfig(new ExplainConfiguration());
    ExplainTask newExplainTask = new ExplainTask();
    newExplainTask.queryState = uut.queryState;
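    // render the logical plan as JSON into the in-memory stream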
    newExplainTask.getJSONLogicalPlan(new PrintStream(baos), work);
    baos.close();
    return baos.toString();
}
Also used : PrintStream(java.io.PrintStream) ExplainConfiguration(org.apache.hadoop.hive.ql.parse.ExplainConfiguration) ByteArrayOutputStream(org.apache.commons.io.output.ByteArrayOutputStream) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext)
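
For reference, here is a minimal sketch of how the private explainToString helper above could be exercised from another test in the same class. The test method name, the map contents, and the assertion are assumptions made for illustration and are not taken from the original TestExplainTask; JUnit's @Test and assertNotNull, as well as java.util.Map and java.util.LinkedHashMap, are assumed to be available in the test class already.

@Test
public void testExplainToStringProducesOutput() throws Exception {
    // hypothetical input map; any key/value pair works for the sketch
    Map<String, String> explainMap = new LinkedHashMap<>();
    explainMap.put("sampleKey", "sampleValue");
    // explainToString wraps the map in a DummyExplainDesc, runs the JSON
    // logical-plan explain over it, and returns the captured output
    String renderedPlan = explainToString(explainMap);
    assertNotNull(renderedPlan);
}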

Aggregations

ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext): 35 uses
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 15 uses
ArrayList (java.util.ArrayList): 14 uses
Path (org.apache.hadoop.fs.Path): 12 uses
Context (org.apache.hadoop.hive.ql.Context): 12 uses
Node (org.apache.hadoop.hive.ql.lib.Node): 10 uses
Operator (org.apache.hadoop.hive.ql.exec.Operator): 8 uses
LinkedHashMap (java.util.LinkedHashMap): 7 uses
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 7 uses
Task (org.apache.hadoop.hive.ql.exec.Task): 7 uses
DefaultRuleDispatcher (org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher): 7 uses
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 7 uses
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 7 uses
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 7 uses
SemanticDispatcher (org.apache.hadoop.hive.ql.lib.SemanticDispatcher): 6 uses
SemanticGraphWalker (org.apache.hadoop.hive.ql.lib.SemanticGraphWalker): 6 uses
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 6 uses
Table (org.apache.hadoop.hive.ql.metadata.Table): 6 uses
PrunedPartitionList (org.apache.hadoop.hive.ql.parse.PrunedPartitionList): 6 uses
FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc): 6 uses