
Example 21 with OperatorDesc

Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

The class MapReduceCompiler, method generateTaskTree.

@Override
protected void generateTaskTree(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx, List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs, Set<WriteEntity> outputs) throws SemanticException {
    // generate map reduce plans
    ParseContext tempParseContext = getParseContext(pCtx, rootTasks);
    GenMRProcContext procCtx = new GenMRProcContext(conf,
        // Must be deterministic order map for consistent q-test output across Java versions
        new LinkedHashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>>(),
        tempParseContext, mvTask, rootTasks,
        new LinkedHashMap<Operator<? extends OperatorDesc>, GenMapRedCtx>(), inputs, outputs);
    // create a walker which walks the tree in a DFS manner while maintaining
    // the operator stack.
    // The dispatcher generates the plan from the operator tree
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp(new String("R1"), TableScanOperator.getOperatorName() + "%"), new GenMRTableScan1());
    opRules.put(new RuleRegExp(new String("R2"), TableScanOperator.getOperatorName() + "%.*" + ReduceSinkOperator.getOperatorName() + "%"), new GenMRRedSink1());
    opRules.put(new RuleRegExp(new String("R3"), ReduceSinkOperator.getOperatorName() + "%.*" + ReduceSinkOperator.getOperatorName() + "%"), new GenMRRedSink2());
    opRules.put(new RuleRegExp(new String("R4"), FileSinkOperator.getOperatorName() + "%"), new GenMRFileSink1());
    opRules.put(new RuleRegExp(new String("R5"), UnionOperator.getOperatorName() + "%"), new GenMRUnion1());
    opRules.put(new RuleRegExp(new String("R6"), UnionOperator.getOperatorName() + "%.*" + ReduceSinkOperator.getOperatorName() + "%"), new GenMRRedSink3());
    opRules.put(new RuleRegExp(new String("R7"), MapJoinOperator.getOperatorName() + "%"), MapJoinFactory.getTableScanMapJoin());
    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(new GenMROperator(), opRules, procCtx);
    GraphWalker ogw = new GenMapRedWalker(disp);
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pCtx.getTopOps().values());
    ogw.startWalking(topNodes, null);
}
Also used: ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) GenMROperator(org.apache.hadoop.hive.ql.optimizer.GenMROperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) Node(org.apache.hadoop.hive.ql.lib.Node) ArrayList(java.util.ArrayList) Dispatcher(org.apache.hadoop.hive.ql.lib.Dispatcher) DefaultRuleDispatcher(org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher) LinkedHashMap(java.util.LinkedHashMap) GenMRTableScan1(org.apache.hadoop.hive.ql.optimizer.GenMRTableScan1) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) GenMRProcContext(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext) GraphWalker(org.apache.hadoop.hive.ql.lib.GraphWalker) NodeProcessor(org.apache.hadoop.hive.ql.lib.NodeProcessor) GenMRUnion1(org.apache.hadoop.hive.ql.optimizer.GenMRUnion1) GenMRFileSink1(org.apache.hadoop.hive.ql.optimizer.GenMRFileSink1) RuleRegExp(org.apache.hadoop.hive.ql.lib.RuleRegExp) GenMRRedSink2(org.apache.hadoop.hive.ql.optimizer.GenMRRedSink2) GenMRRedSink1(org.apache.hadoop.hive.ql.optimizer.GenMRRedSink1) GenMRRedSink3(org.apache.hadoop.hive.ql.optimizer.GenMRRedSink3) Rule(org.apache.hadoop.hive.ql.lib.Rule) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
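
Each rule registered above fires a NodeProcessor as the walker visits operators, and every node the walker hands over is an Operator<? extends OperatorDesc>. The following is a minimal sketch of such a processor, not part of the Hive source: the class name TableScanCounter is invented for illustration, and it assumes the same NodeProcessor interface and imports as the example above plus java.util.Stack, NodeProcessorCtx (org.apache.hadoop.hive.ql.lib) and SemanticException (org.apache.hadoop.hive.ql.parse).

public static class TableScanCounter implements NodeProcessor {

    private int scans = 0;

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
        // Every node the walker dispatches here is an Operator over some OperatorDesc.
        Operator<? extends OperatorDesc> op = (Operator<? extends OperatorDesc>) nd;
        if (op instanceof TableScanOperator) {
            scans++;
        }
        return null;
    }

    public int getScans() {
        return scans;
    }
}

A processor like this would be registered the same way as the built-in ones, for example opRules.put(new RuleRegExp("R8", TableScanOperator.getOperatorName() + "%"), new TableScanCounter()); the rule name "R8" is likewise only illustrative.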

Example 22 with OperatorDesc

Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

The class MapReduceCompiler, method setInputFormat.

// loop over all the tasks recursively
@Override
protected void setInputFormat(Task<? extends Serializable> task) {
    if (task instanceof ExecDriver) {
        MapWork work = ((MapredWork) task.getWork()).getMapWork();
        HashMap<String, Operator<? extends OperatorDesc>> opMap = work.getAliasToWork();
        if (!opMap.isEmpty()) {
            for (Operator<? extends OperatorDesc> op : opMap.values()) {
                setInputFormat(work, op);
            }
        }
    } else if (task instanceof ConditionalTask) {
        List<Task<? extends Serializable>> listTasks = ((ConditionalTask) task).getListTasks();
        for (Task<? extends Serializable> tsk : listTasks) {
            setInputFormat(tsk);
        }
    }
    if (task.getChildTasks() != null) {
        for (Task<? extends Serializable> childTask : task.getChildTasks()) {
            setInputFormat(childTask);
        }
    }
}
Also used: ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) GenMROperator(org.apache.hadoop.hive.ql.optimizer.GenMROperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ExecDriver(org.apache.hadoop.hive.ql.exec.mr.ExecDriver) List(java.util.List) ArrayList(java.util.ArrayList) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
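
The traversal pattern in setInputFormat (descend into an ExecDriver's MapWork, expand a ConditionalTask's alternatives, then recurse into child tasks) is reusable for other per-task work. As a rough sketch under the same imports, a hypothetical helper named collectMapOperators could gather every map-side operator reachable from a task:

private static void collectMapOperators(Task<? extends Serializable> task, List<Operator<? extends OperatorDesc>> out) {
    if (task instanceof ExecDriver) {
        // Map-side operators live in the MapWork half of the MapredWork plan.
        MapWork work = ((MapredWork) task.getWork()).getMapWork();
        out.addAll(work.getAliasToWork().values());
    } else if (task instanceof ConditionalTask) {
        // A conditional task is a placeholder for several alternative tasks.
        for (Task<? extends Serializable> tsk : ((ConditionalTask) task).getListTasks()) {
            collectMapOperators(tsk, out);
        }
    }
    // Continue into downstream tasks, mirroring setInputFormat above.
    if (task.getChildTasks() != null) {
        for (Task<? extends Serializable> childTask : task.getChildTasks()) {
            collectMapOperators(childTask, out);
        }
    }
}

Like setInputFormat itself, this sketch may visit a shared child task more than once; add a visited set if that matters for the work being done per task.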

Example 23 with OperatorDesc

Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

The class QueryPlan, method populateQueryPlan.

/**
   * Populate api.QueryPlan from exec structures. This includes constructing the
   * dependency graphs of stages and operators.
   *
   * @throws IOException
   */
private void populateQueryPlan() throws IOException {
    query.setStageGraph(new org.apache.hadoop.hive.ql.plan.api.Graph());
    query.getStageGraph().setNodeType(NodeType.STAGE);
    Queue<Task<? extends Serializable>> tasksToVisit = new LinkedList<Task<? extends Serializable>>();
    Set<Task<? extends Serializable>> tasksVisited = new HashSet<Task<? extends Serializable>>();
    tasksToVisit.addAll(rootTasks);
    while (tasksToVisit.size() != 0) {
        Task<? extends Serializable> task = tasksToVisit.remove();
        tasksVisited.add(task);
        // populate stage
        org.apache.hadoop.hive.ql.plan.api.Stage stage = new org.apache.hadoop.hive.ql.plan.api.Stage();
        stage.setStageId(task.getId());
        stage.setStageType(task.getType());
        query.addToStageList(stage);
        if (task instanceof ExecDriver) {
            // populate map task
            ExecDriver mrTask = (ExecDriver) task;
            org.apache.hadoop.hive.ql.plan.api.Task mapTask = new org.apache.hadoop.hive.ql.plan.api.Task();
            mapTask.setTaskId(stage.getStageId() + "_MAP");
            mapTask.setTaskType(TaskType.MAP);
            stage.addToTaskList(mapTask);
            populateOperatorGraph(mapTask, mrTask.getWork().getMapWork().getAliasToWork().values());
            // populate reduce task
            if (mrTask.hasReduce()) {
                org.apache.hadoop.hive.ql.plan.api.Task reduceTask = new org.apache.hadoop.hive.ql.plan.api.Task();
                reduceTask.setTaskId(stage.getStageId() + "_REDUCE");
                reduceTask.setTaskType(TaskType.REDUCE);
                stage.addToTaskList(reduceTask);
                Collection<Operator<? extends OperatorDesc>> reducerTopOps = new ArrayList<Operator<? extends OperatorDesc>>();
                reducerTopOps.add(mrTask.getWork().getReduceWork().getReducer());
                populateOperatorGraph(reduceTask, reducerTopOps);
            }
        } else {
            org.apache.hadoop.hive.ql.plan.api.Task otherTask = new org.apache.hadoop.hive.ql.plan.api.Task();
            otherTask.setTaskId(stage.getStageId() + "_OTHER");
            otherTask.setTaskType(TaskType.OTHER);
            stage.addToTaskList(otherTask);
        }
        if (task instanceof ConditionalTask) {
            org.apache.hadoop.hive.ql.plan.api.Adjacency listEntry = new org.apache.hadoop.hive.ql.plan.api.Adjacency();
            listEntry.setAdjacencyType(AdjacencyType.DISJUNCTIVE);
            listEntry.setNode(task.getId());
            ConditionalTask t = (ConditionalTask) task;
            for (Task<? extends Serializable> listTask : t.getListTasks()) {
                if (t.getChildTasks() != null) {
                    org.apache.hadoop.hive.ql.plan.api.Adjacency childEntry = new org.apache.hadoop.hive.ql.plan.api.Adjacency();
                    childEntry.setAdjacencyType(AdjacencyType.DISJUNCTIVE);
                    childEntry.setNode(listTask.getId());
                    // done processing the task
                    for (Task<? extends Serializable> childTask : t.getChildTasks()) {
                        childEntry.addToChildren(childTask.getId());
                        if (!tasksVisited.contains(childTask)) {
                            tasksToVisit.add(childTask);
                        }
                    }
                    query.getStageGraph().addToAdjacencyList(childEntry);
                }
                listEntry.addToChildren(listTask.getId());
                if (!tasksVisited.contains(listTask)) {
                    tasksToVisit.add(listTask);
                }
            }
            query.getStageGraph().addToAdjacencyList(listEntry);
        } else if (task.getChildTasks() != null) {
            org.apache.hadoop.hive.ql.plan.api.Adjacency entry = new org.apache.hadoop.hive.ql.plan.api.Adjacency();
            entry.setAdjacencyType(AdjacencyType.CONJUNCTIVE);
            entry.setNode(task.getId());
            // done processing the task
            for (Task<? extends Serializable> childTask : task.getChildTasks()) {
                entry.addToChildren(childTask.getId());
                if (!tasksVisited.contains(childTask)) {
                    tasksToVisit.add(childTask);
                }
            }
            query.getStageGraph().addToAdjacencyList(entry);
        }
    }
}
Also used: Operator(org.apache.hadoop.hive.ql.exec.Operator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) ExplainTask(org.apache.hadoop.hive.ql.exec.ExplainTask) Serializable(java.io.Serializable) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedList(java.util.LinkedList) ExecDriver(org.apache.hadoop.hive.ql.exec.mr.ExecDriver) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
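
Stripped of the Thrift stage and adjacency construction, populateQueryPlan is a plain breadth-first walk over the task DAG with a visited set. A minimal sketch of just that walk, assuming the same task classes as the example plus java.util.Queue and java.util.Set; the method name listTaskIds is hypothetical:

private static List<String> listTaskIds(List<Task<? extends Serializable>> rootTasks) {
    Queue<Task<? extends Serializable>> tasksToVisit = new LinkedList<Task<? extends Serializable>>(rootTasks);
    Set<Task<? extends Serializable>> tasksVisited = new HashSet<Task<? extends Serializable>>();
    List<String> ids = new ArrayList<String>();
    while (!tasksToVisit.isEmpty()) {
        Task<? extends Serializable> task = tasksToVisit.remove();
        if (!tasksVisited.add(task)) {
            // already reached through another parent; skip re-processing
            continue;
        }
        ids.add(task.getId());
        // Conditional tasks fan out to alternative list tasks as well as child tasks.
        if (task instanceof ConditionalTask) {
            tasksToVisit.addAll(((ConditionalTask) task).getListTasks());
        }
        if (task.getChildTasks() != null) {
            tasksToVisit.addAll(task.getChildTasks());
        }
    }
    return ids;
}

The real method does the same enqueueing, but additionally records each ConditionalTask edge as a DISJUNCTIVE adjacency and each ordinary parent-child edge as a CONJUNCTIVE adjacency in the stage graph.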

Example 24 with OperatorDesc

Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

The class MapOperator, method setChildren.

public void setChildren(Configuration hconf) throws Exception {
    List<Operator<? extends OperatorDesc>> children = new ArrayList<Operator<? extends OperatorDesc>>();
    Map<String, Configuration> tableNameToConf = cloneConfsForNestedColPruning(hconf);
    Map<TableDesc, StructObjectInspector> convertedOI = getConvertedOI(tableNameToConf);
    for (Map.Entry<Path, ArrayList<String>> entry : conf.getPathToAliases().entrySet()) {
        Path onefile = entry.getKey();
        List<String> aliases = entry.getValue();
        PartitionDesc partDesc = conf.getPathToPartitionInfo().get(onefile);
        TableDesc tableDesc = partDesc.getTableDesc();
        Configuration newConf = tableNameToConf.get(tableDesc.getTableName());
        for (String alias : aliases) {
            Operator<? extends OperatorDesc> op = conf.getAliasToWork().get(alias);
            if (isLogDebugEnabled) {
                LOG.debug("Adding alias " + alias + " to work list for file " + onefile);
            }
            Map<Operator<?>, MapOpCtx> contexts = opCtxMap.get(onefile.toString());
            if (contexts == null) {
                opCtxMap.put(onefile.toString(), contexts = new LinkedHashMap<Operator<?>, MapOpCtx>());
            }
            if (contexts.containsKey(op)) {
                continue;
            }
            MapOpCtx context = new MapOpCtx(alias, op, partDesc);
            StructObjectInspector tableRowOI = convertedOI.get(partDesc.getTableDesc());
            contexts.put(op, initObjectInspector(newConf, context, tableRowOI));
            if (!children.contains(op)) {
                op.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(1));
                op.getParentOperators().add(this);
                children.add(op);
            }
        }
    }
    initOperatorContext(children);
    // we found all the operators that we are supposed to process.
    setChildOperators(children);
}
Also used: Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) HashMap(java.util.HashMap) Map(java.util.Map) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
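
The heart of the loop above is the parent/child wiring between the MapOperator (this) and each alias operator pulled from aliasToWork. Pulled out as a hypothetical helper (the name linkOperators is invented, not Hive API), the same wiring looks roughly like this, assuming Operator's standard getParentOperators/setParentOperators and getChildOperators/setChildOperators accessors:

private static void linkOperators(Operator<? extends OperatorDesc> parent, Operator<? extends OperatorDesc> child) {
    // Point the child back at its parent.
    if (child.getParentOperators() == null) {
        child.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(1));
    }
    child.getParentOperators().add(parent);
    // Register the child under the parent.
    if (parent.getChildOperators() == null) {
        parent.setChildOperators(new ArrayList<Operator<? extends OperatorDesc>>(1));
    }
    parent.getChildOperators().add(child);
}

setChildren achieves the same effect in bulk: it sets each operator's parent list as it goes, collects the distinct children, and installs them once at the end with setChildOperators(children).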

Example 25 with OperatorDesc

Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

The class CombineHiveInputFormat, method getCombineSplits.

/**
   * Create Hive splits based on CombineFileSplit.
   */
private InputSplit[] getCombineSplits(JobConf job, int numSplits, Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
    init(job);
    Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
    CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();
    InputSplit[] splits = null;
    if (combine == null) {
        splits = super.getSplits(job, numSplits);
        return splits;
    }
    if (combine.getInputPathsShim(job).length == 0) {
        throw new IOException("No input paths specified in job");
    }
    ArrayList<InputSplit> result = new ArrayList<InputSplit>();
    // Combine splits only from the same table and the same partition. Do not combine splits
    // from multiple tables or multiple partitions.
    Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
    List<Path> inpDirs = new ArrayList<Path>();
    List<Path> inpFiles = new ArrayList<Path>();
    Map<CombinePathInputFormat, CombineFilter> poolMap = new HashMap<CombinePathInputFormat, CombineFilter>();
    Set<Path> poolSet = new HashSet<Path>();
    for (Path path : paths) {
        PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, path, IOPrepareCache.get().allocatePartitionDescMap());
        TableDesc tableDesc = part.getTableDesc();
        if ((tableDesc != null) && tableDesc.isNonNative()) {
            return super.getSplits(job, numSplits);
        }
        // Use HiveInputFormat if any of the paths is not splittable
        Class inputFormatClass = part.getInputFileFormatClass();
        String inputFormatClassName = inputFormatClass.getName();
        InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
        String deserializerClassName = null;
        try {
            deserializerClassName = part.getDeserializer(job).getClass().getName();
        } catch (Exception e) {
        // ignore
        }
        FileSystem inpFs = path.getFileSystem(job);
        // don't combine if the input format is a SymlinkTextInputFormat
        if (inputFormat instanceof SymlinkTextInputFormat) {
            splits = super.getSplits(job, numSplits);
            return splits;
        }
        Path filterPath = path;
        // Does a pool already exist for this path?
        CombineFilter f = null;
        List<Operator<? extends OperatorDesc>> opList = null;
        if (!mrwork.isMapperCannotSpanPartns()) {
            // if the mapper can span partitions, make sure a split does not contain multiple
            // opList + inputFormatClassName + deserializerClassName combinations.
            // This is done using the Map of CombinePathInputFormat to PathFilter.
            opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
            CombinePathInputFormat combinePathInputFormat = new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
            f = poolMap.get(combinePathInputFormat);
            if (f == null) {
                f = new CombineFilter(filterPath);
                LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
                combine.createPool(job, f);
                poolMap.put(combinePathInputFormat, f);
            } else {
                LOG.info("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
                f.addPath(filterPath);
            }
        } else {
            // the mapper cannot span partitions here, so combine files but do not cross partition boundaries.
            if (!path.getFileSystem(job).getFileStatus(path).isDir()) {
                // path is not directory
                filterPath = path.getParent();
                inpFiles.add(path);
                poolSet.add(filterPath);
            } else {
                inpDirs.add(path);
            }
        }
    }
    // Processing directories
    List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
    if (!mrwork.isMapperCannotSpanPartns()) {
        //mapper can span partitions
        //combine into as few as one split, subject to the PathFilters set
        // using combine.createPool.
        iss = Arrays.asList(combine.getSplits(job, 1));
    } else {
        for (Path path : inpDirs) {
            processPaths(job, combine, iss, path);
        }
        if (inpFiles.size() > 0) {
            // Processing files
            for (Path filterPath : poolSet) {
                combine.createPool(job, new CombineFilter(filterPath));
            }
            processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
        }
    }
    if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
        iss = sampleSplits(iss);
    }
    for (CombineFileSplit is : iss) {
        CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
        result.add(csplit);
    }
    LOG.info("number of splits " + result.size());
    return result.toArray(new CombineHiveInputSplit[result.size()]);
}
Also used: Operator(org.apache.hadoop.hive.ql.exec.Operator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CombineFileSplit(org.apache.hadoop.mapred.lib.CombineFileSplit) CombineFileInputFormatShim(org.apache.hadoop.hive.shims.HadoopShims.CombineFileInputFormatShim) FileSystem(org.apache.hadoop.fs.FileSystem) InputSplit(org.apache.hadoop.mapred.InputSplit) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) InputFormat(org.apache.hadoop.mapred.InputFormat) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
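
The pooling step hinges on grouping paths by their (operator list, input format, deserializer) combination so that a combined split never mixes incompatible inputs; CombinePathInputFormat acts as that composite key in poolMap. A minimal sketch of such a key class, assuming java.util.Objects in addition to the imports above; PoolKey is an invented stand-in, not the real CombinePathInputFormat:

final class PoolKey {

    private final List<Operator<? extends OperatorDesc>> opList;
    private final String inputFormatClassName;
    private final String deserializerClassName;

    PoolKey(List<Operator<? extends OperatorDesc>> opList, String inputFormatClassName, String deserializerClassName) {
        this.opList = opList;
        this.inputFormatClassName = inputFormatClassName;
        this.deserializerClassName = deserializerClassName;
    }

    @Override
    public boolean equals(Object o) {
        if (!(o instanceof PoolKey)) {
            return false;
        }
        PoolKey other = (PoolKey) o;
        // Two paths share a pool only when all three components match.
        return Objects.equals(opList, other.opList)
            && Objects.equals(inputFormatClassName, other.inputFormatClassName)
            && Objects.equals(deserializerClassName, other.deserializerClassName);
    }

    @Override
    public int hashCode() {
        return Objects.hash(opList, inputFormatClassName, deserializerClassName);
    }
}

Keys that compare equal map to the same pool entry, which is what lets getCombineSplits reuse an existing CombineFilter (and simply add the new path to it) instead of calling combine.createPool for every path.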

Aggregations

OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 78 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 65 usages
ArrayList (java.util.ArrayList): 47 usages
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 41 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 36 usages
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 32 usages
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 30 usages
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 29 usages
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 22 usages
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 21 usages
Path (org.apache.hadoop.fs.Path): 20 usages
LinkedHashMap (java.util.LinkedHashMap): 18 usages
HashMap (java.util.HashMap): 16 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 16 usages
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 16 usages
Serializable (java.io.Serializable): 15 usages
Task (org.apache.hadoop.hive.ql.exec.Task): 15 usages
List (java.util.List): 14 usages
Map (java.util.Map): 13 usages
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 13 usages