Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
Class MapReduceCompiler, method generateTaskTree.
@Override
protected void generateTaskTree(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx,
    List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs, Set<WriteEntity> outputs) throws SemanticException {
  // generate map reduce plans
  ParseContext tempParseContext = getParseContext(pCtx, rootTasks);
  GenMRProcContext procCtx = new GenMRProcContext(conf,
      // Must be deterministic order map for consistent q-test output across Java versions
      new LinkedHashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>>(),
      tempParseContext, mvTask, rootTasks,
      new LinkedHashMap<Operator<? extends OperatorDesc>, GenMapRedCtx>(), inputs, outputs);
  // create a walker which walks the tree in a DFS manner while maintaining
  // the operator stack.
  // The dispatcher generates the plan from the operator tree
  Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
  opRules.put(new RuleRegExp(new String("R1"), TableScanOperator.getOperatorName() + "%"),
      new GenMRTableScan1());
  opRules.put(new RuleRegExp(new String("R2"), TableScanOperator.getOperatorName() + "%.*"
      + ReduceSinkOperator.getOperatorName() + "%"), new GenMRRedSink1());
  opRules.put(new RuleRegExp(new String("R3"), ReduceSinkOperator.getOperatorName() + "%.*"
      + ReduceSinkOperator.getOperatorName() + "%"), new GenMRRedSink2());
  opRules.put(new RuleRegExp(new String("R4"), FileSinkOperator.getOperatorName() + "%"),
      new GenMRFileSink1());
  opRules.put(new RuleRegExp(new String("R5"), UnionOperator.getOperatorName() + "%"),
      new GenMRUnion1());
  opRules.put(new RuleRegExp(new String("R6"), UnionOperator.getOperatorName() + "%.*"
      + ReduceSinkOperator.getOperatorName() + "%"), new GenMRRedSink3());
  opRules.put(new RuleRegExp(new String("R7"), MapJoinOperator.getOperatorName() + "%"),
      MapJoinFactory.getTableScanMapJoin());
  // The dispatcher fires the processor corresponding to the closest matching
  // rule and passes the context along
  Dispatcher disp = new DefaultRuleDispatcher(new GenMROperator(), opRules, procCtx);
  GraphWalker ogw = new GenMapRedWalker(disp);
  ArrayList<Node> topNodes = new ArrayList<Node>();
  topNodes.addAll(pCtx.getTopOps().values());
  ogw.startWalking(topNodes, null);
}
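The method wires RuleRegExp patterns over operator-name paths into a DefaultRuleDispatcher and drives it with a depth-first GenMapRedWalker. The sketch below illustrates that dispatch pattern in isolation; the Node names, the Handler interface and the first-match policy are simplifications for illustration, not Hive's NodeProcessor/Dispatcher API (Hive fires the closest, i.e. lowest-cost, matching rule).

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

// Illustrative sketch only: rule-based dispatch over an operator-name path,
// loosely mirroring RuleRegExp + DefaultRuleDispatcher. The names, rules and
// first-match policy are assumptions, not Hive's API.
public class RuleDispatchSketch {

    interface Handler {
        void process(String path);
    }

    public static void main(String[] args) {
        // Rules keyed by a regex over the concatenated operator-name path,
        // analogous to patterns like "TS%.*RS%" in generateTaskTree.
        Map<Pattern, Handler> rules = new LinkedHashMap<>();
        rules.put(Pattern.compile("TS.*RS"), p -> System.out.println("table-scan -> reduce-sink rule fired on " + p));
        rules.put(Pattern.compile("TS.*"), p -> System.out.println("table-scan rule fired on " + p));
        Handler defaultHandler = p -> System.out.println("default processor on " + p);

        // The operator stack a depth-first walk would have accumulated.
        Deque<String> operatorStack = new ArrayDeque<>(List.of("TS", "FIL", "RS"));
        String path = String.join("", operatorStack);

        // Fire the first matching rule, falling back to the default processor.
        rules.entrySet().stream()
            .filter(e -> e.getKey().matcher(path).matches())
            .findFirst()
            .map(Map.Entry::getValue)
            .orElse(defaultHandler)
            .process(path);
    }
}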
Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
Class MapReduceCompiler, method setInputFormat.
// loop over all the tasks recursively
@Override
protected void setInputFormat(Task<? extends Serializable> task) {
  if (task instanceof ExecDriver) {
    MapWork work = ((MapredWork) task.getWork()).getMapWork();
    HashMap<String, Operator<? extends OperatorDesc>> opMap = work.getAliasToWork();
    if (!opMap.isEmpty()) {
      for (Operator<? extends OperatorDesc> op : opMap.values()) {
        setInputFormat(work, op);
      }
    }
  } else if (task instanceof ConditionalTask) {
    List<Task<? extends Serializable>> listTasks = ((ConditionalTask) task).getListTasks();
    for (Task<? extends Serializable> tsk : listTasks) {
      setInputFormat(tsk);
    }
  }
  if (task.getChildTasks() != null) {
    for (Task<? extends Serializable> childTask : task.getChildTasks()) {
      setInputFormat(childTask);
    }
  }
}
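setInputFormat simply recurses: into the alias-to-work operators of an ExecDriver, into the alternatives of a ConditionalTask, and then into any child tasks. A self-contained sketch of that traversal order, using a hypothetical SimpleTask type instead of Hive's Task/ExecDriver/ConditionalTask hierarchy:

import java.util.ArrayList;
import java.util.List;

// Illustrative sketch only: the recursive visit order of setInputFormat,
// with a hypothetical SimpleTask type instead of Hive's task classes.
public class TaskVisitSketch {

    static class SimpleTask {
        final String name;
        final List<SimpleTask> listTasks = new ArrayList<>();  // like ConditionalTask.getListTasks()
        final List<SimpleTask> childTasks = new ArrayList<>(); // like Task.getChildTasks()
        SimpleTask(String name) { this.name = name; }
    }

    // Visit the task itself, then the alternatives of a conditional task,
    // then the child tasks, mirroring the order of the Hive method.
    static void visit(SimpleTask task) {
        System.out.println("configuring input format for " + task.name);
        for (SimpleTask alternative : task.listTasks) {
            visit(alternative);
        }
        for (SimpleTask child : task.childTasks) {
            visit(child);
        }
    }

    public static void main(String[] args) {
        SimpleTask root = new SimpleTask("root");
        SimpleTask conditional = new SimpleTask("conditional");
        conditional.listTasks.add(new SimpleTask("alternative-1"));
        root.childTasks.add(conditional);
        visit(root);
    }
}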
Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
Class QueryPlan, method populateQueryPlan.
/**
 * Populate api.QueryPlan from exec structures. This includes constructing the
 * dependency graphs of stages and operators.
 *
 * @throws IOException
 */
private void populateQueryPlan() throws IOException {
  query.setStageGraph(new org.apache.hadoop.hive.ql.plan.api.Graph());
  query.getStageGraph().setNodeType(NodeType.STAGE);
  Queue<Task<? extends Serializable>> tasksToVisit = new LinkedList<Task<? extends Serializable>>();
  Set<Task<? extends Serializable>> tasksVisited = new HashSet<Task<? extends Serializable>>();
  tasksToVisit.addAll(rootTasks);
  while (tasksToVisit.size() != 0) {
    Task<? extends Serializable> task = tasksToVisit.remove();
    tasksVisited.add(task);
    // populate stage
    org.apache.hadoop.hive.ql.plan.api.Stage stage = new org.apache.hadoop.hive.ql.plan.api.Stage();
    stage.setStageId(task.getId());
    stage.setStageType(task.getType());
    query.addToStageList(stage);
    if (task instanceof ExecDriver) {
      // populate map task
      ExecDriver mrTask = (ExecDriver) task;
      org.apache.hadoop.hive.ql.plan.api.Task mapTask = new org.apache.hadoop.hive.ql.plan.api.Task();
      mapTask.setTaskId(stage.getStageId() + "_MAP");
      mapTask.setTaskType(TaskType.MAP);
      stage.addToTaskList(mapTask);
      populateOperatorGraph(mapTask, mrTask.getWork().getMapWork().getAliasToWork().values());
      // populate reduce task
      if (mrTask.hasReduce()) {
        org.apache.hadoop.hive.ql.plan.api.Task reduceTask = new org.apache.hadoop.hive.ql.plan.api.Task();
        reduceTask.setTaskId(stage.getStageId() + "_REDUCE");
        reduceTask.setTaskType(TaskType.REDUCE);
        stage.addToTaskList(reduceTask);
        Collection<Operator<? extends OperatorDesc>> reducerTopOps =
            new ArrayList<Operator<? extends OperatorDesc>>();
        reducerTopOps.add(mrTask.getWork().getReduceWork().getReducer());
        populateOperatorGraph(reduceTask, reducerTopOps);
      }
    } else {
      org.apache.hadoop.hive.ql.plan.api.Task otherTask = new org.apache.hadoop.hive.ql.plan.api.Task();
      otherTask.setTaskId(stage.getStageId() + "_OTHER");
      otherTask.setTaskType(TaskType.OTHER);
      stage.addToTaskList(otherTask);
    }
    if (task instanceof ConditionalTask) {
      org.apache.hadoop.hive.ql.plan.api.Adjacency listEntry = new org.apache.hadoop.hive.ql.plan.api.Adjacency();
      listEntry.setAdjacencyType(AdjacencyType.DISJUNCTIVE);
      listEntry.setNode(task.getId());
      ConditionalTask t = (ConditionalTask) task;
      for (Task<? extends Serializable> listTask : t.getListTasks()) {
        if (t.getChildTasks() != null) {
          org.apache.hadoop.hive.ql.plan.api.Adjacency childEntry = new org.apache.hadoop.hive.ql.plan.api.Adjacency();
          childEntry.setAdjacencyType(AdjacencyType.DISJUNCTIVE);
          childEntry.setNode(listTask.getId());
          // done processing the task
          for (Task<? extends Serializable> childTask : t.getChildTasks()) {
            childEntry.addToChildren(childTask.getId());
            if (!tasksVisited.contains(childTask)) {
              tasksToVisit.add(childTask);
            }
          }
          query.getStageGraph().addToAdjacencyList(childEntry);
        }
        listEntry.addToChildren(listTask.getId());
        if (!tasksVisited.contains(listTask)) {
          tasksToVisit.add(listTask);
        }
      }
      query.getStageGraph().addToAdjacencyList(listEntry);
    } else if (task.getChildTasks() != null) {
      org.apache.hadoop.hive.ql.plan.api.Adjacency entry = new org.apache.hadoop.hive.ql.plan.api.Adjacency();
      entry.setAdjacencyType(AdjacencyType.CONJUNCTIVE);
      entry.setNode(task.getId());
      // done processing the task
      for (Task<? extends Serializable> childTask : task.getChildTasks()) {
        entry.addToChildren(childTask.getId());
        if (!tasksVisited.contains(childTask)) {
          tasksToVisit.add(childTask);
        }
      }
      query.getStageGraph().addToAdjacencyList(entry);
    }
  }
}
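The stage graph is built with a breadth-first walk over the task DAG: a queue of tasks to visit, a visited set so already-processed children are not re-enqueued, and one Adjacency entry per parent. The sketch below shows just that walk with a hypothetical StageNode type; printed adjacency lists stand in for the Thrift api.Graph/Adjacency objects.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;

// Illustrative sketch only: the breadth-first stage walk of populateQueryPlan,
// with a hypothetical StageNode type instead of Hive's Task hierarchy.
public class StageGraphSketch {

    static class StageNode {
        final String id;
        final List<StageNode> children = new ArrayList<>();
        StageNode(String id) { this.id = id; }
    }

    // Queue + visited set, as in populateQueryPlan: a node is marked visited
    // when dequeued, and children are enqueued only if not yet visited.
    static void populate(List<StageNode> roots) {
        Queue<StageNode> toVisit = new LinkedList<>(roots);
        Set<StageNode> visited = new HashSet<>();
        while (!toVisit.isEmpty()) {
            StageNode node = toVisit.remove();
            visited.add(node);
            if (node.children.isEmpty()) {
                continue;
            }
            List<String> childIds = new ArrayList<>();
            for (StageNode child : node.children) {
                childIds.add(child.id);
                if (!visited.contains(child)) {
                    toVisit.add(child);
                }
            }
            System.out.println(node.id + " -> " + childIds); // one adjacency entry per parent
        }
    }

    public static void main(String[] args) {
        StageNode s1 = new StageNode("Stage-1");
        StageNode s2 = new StageNode("Stage-2");
        StageNode s3 = new StageNode("Stage-3");
        s1.children.add(s2);
        s1.children.add(s3);
        s2.children.add(s3); // shared child
        populate(List.of(s1));
    }
}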
Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
Class MapOperator, method setChildren.
public void setChildren(Configuration hconf) throws Exception {
  List<Operator<? extends OperatorDesc>> children = new ArrayList<Operator<? extends OperatorDesc>>();
  Map<String, Configuration> tableNameToConf = cloneConfsForNestedColPruning(hconf);
  Map<TableDesc, StructObjectInspector> convertedOI = getConvertedOI(tableNameToConf);
  for (Map.Entry<Path, ArrayList<String>> entry : conf.getPathToAliases().entrySet()) {
    Path onefile = entry.getKey();
    List<String> aliases = entry.getValue();
    PartitionDesc partDesc = conf.getPathToPartitionInfo().get(onefile);
    TableDesc tableDesc = partDesc.getTableDesc();
    Configuration newConf = tableNameToConf.get(tableDesc.getTableName());
    for (String alias : aliases) {
      Operator<? extends OperatorDesc> op = conf.getAliasToWork().get(alias);
      if (isLogDebugEnabled) {
        LOG.debug("Adding alias " + alias + " to work list for file " + onefile);
      }
      Map<Operator<?>, MapOpCtx> contexts = opCtxMap.get(onefile.toString());
      if (contexts == null) {
        opCtxMap.put(onefile.toString(), contexts = new LinkedHashMap<Operator<?>, MapOpCtx>());
      }
      if (contexts.containsKey(op)) {
        continue;
      }
      MapOpCtx context = new MapOpCtx(alias, op, partDesc);
      StructObjectInspector tableRowOI = convertedOI.get(partDesc.getTableDesc());
      contexts.put(op, initObjectInspector(newConf, context, tableRowOI));
      if (!children.contains(op)) {
        op.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(1));
        op.getParentOperators().add(this);
        children.add(op);
      }
    }
  }
  initOperatorContext(children);
  // we found all the operators that we are supposed to process.
  setChildOperators(children);
}
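For each alias the method looks up its operator, initializes a MapOpCtx per (path, operator) pair, and wires the operator under the MapOperator exactly once, resetting its parent list to contain only this. A minimal sketch of that wiring, with a hypothetical Op type standing in for Operator<? extends OperatorDesc>:

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Illustrative sketch only: the parent/child wiring done in setChildren,
// with a hypothetical Op type rather than Hive's Operator classes.
public class OperatorWiringSketch {

    static class Op {
        final String name;
        List<Op> parents = new ArrayList<>();
        List<Op> children = new ArrayList<>();
        Op(String name) { this.name = name; }
    }

    public static void main(String[] args) {
        Op mapOp = new Op("MAP");
        // Two aliases ("a" and "b") that resolve to the same operator, as can
        // happen when several aliases share an input path.
        Map<String, Op> aliasToWork = new LinkedHashMap<>();
        Op shared = new Op("TS_0");
        aliasToWork.put("a", shared);
        aliasToWork.put("b", shared);

        List<Op> children = new ArrayList<>();
        for (Op op : aliasToWork.values()) {
            // Wire each distinct operator exactly once: reset its parents to
            // contain only the map operator, then record it as a child.
            if (!children.contains(op)) {
                op.parents = new ArrayList<>(1);
                op.parents.add(mapOp);
                children.add(op);
            }
        }
        mapOp.children = children;
        System.out.println("children wired: " + children.size()); // prints 1
    }
}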
Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.
Class CombineHiveInputFormat, method getCombineSplits.
/**
 * Create Hive splits based on CombineFileSplit.
 */
private InputSplit[] getCombineSplits(JobConf job, int numSplits,
    Map<Path, PartitionDesc> pathToPartitionInfo) throws IOException {
  init(job);
  Map<Path, ArrayList<String>> pathToAliases = mrwork.getPathToAliases();
  Map<String, Operator<? extends OperatorDesc>> aliasToWork = mrwork.getAliasToWork();
  CombineFileInputFormatShim combine = ShimLoader.getHadoopShims().getCombineFileInputFormat();
  InputSplit[] splits = null;
  if (combine == null) {
    splits = super.getSplits(job, numSplits);
    return splits;
  }
  if (combine.getInputPathsShim(job).length == 0) {
    throw new IOException("No input paths specified in job");
  }
  ArrayList<InputSplit> result = new ArrayList<InputSplit>();
  // combine splits only from same tables and same partitions. Do not combine splits from multiple
  // tables or multiple partitions.
  Path[] paths = StringInternUtils.internUriStringsInPathArray(combine.getInputPathsShim(job));
  List<Path> inpDirs = new ArrayList<Path>();
  List<Path> inpFiles = new ArrayList<Path>();
  Map<CombinePathInputFormat, CombineFilter> poolMap = new HashMap<CombinePathInputFormat, CombineFilter>();
  Set<Path> poolSet = new HashSet<Path>();
  for (Path path : paths) {
    PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo,
        path, IOPrepareCache.get().allocatePartitionDescMap());
    TableDesc tableDesc = part.getTableDesc();
    if ((tableDesc != null) && tableDesc.isNonNative()) {
      return super.getSplits(job, numSplits);
    }
    // Use HiveInputFormat if any of the paths is not splittable
    Class inputFormatClass = part.getInputFileFormatClass();
    String inputFormatClassName = inputFormatClass.getName();
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    String deserializerClassName = null;
    try {
      deserializerClassName = part.getDeserializer(job).getClass().getName();
    } catch (Exception e) {
      // ignore
    }
    FileSystem inpFs = path.getFileSystem(job);
    // don't combine if the input format is a SymlinkTextInputFormat
    if (inputFormat instanceof SymlinkTextInputFormat) {
      splits = super.getSplits(job, numSplits);
      return splits;
    }
    Path filterPath = path;
    // Does a pool exist for this path already
    CombineFilter f = null;
    List<Operator<? extends OperatorDesc>> opList = null;
    if (!mrwork.isMapperCannotSpanPartns()) {
      // if the mapper can span partitions, make sure a split does not contain multiple
      // opList + inputFormatClassName + deserializerClassName combinations.
      // This is done using the Map of CombinePathInputFormat to PathFilter
      opList = HiveFileFormatUtils.doGetWorksFromPath(pathToAliases, aliasToWork, filterPath);
      CombinePathInputFormat combinePathInputFormat =
          new CombinePathInputFormat(opList, inputFormatClassName, deserializerClassName);
      f = poolMap.get(combinePathInputFormat);
      if (f == null) {
        f = new CombineFilter(filterPath);
        LOG.info("CombineHiveInputSplit creating pool for " + path + "; using filter path " + filterPath);
        combine.createPool(job, f);
        poolMap.put(combinePathInputFormat, f);
      } else {
        LOG.info("CombineHiveInputSplit: pool is already created for " + path + "; using filter path " + filterPath);
        f.addPath(filterPath);
      }
    } else {
      // group files by their parent directory so a split combines files from the same partition
      // but won't cross multiple partitions if the user has asked so.
      if (!path.getFileSystem(job).getFileStatus(path).isDir()) {
        // path is not a directory
        filterPath = path.getParent();
        inpFiles.add(path);
        poolSet.add(filterPath);
      } else {
        inpDirs.add(path);
      }
    }
  }
  // Processing directories
  List<CombineFileSplit> iss = new ArrayList<CombineFileSplit>();
  if (!mrwork.isMapperCannotSpanPartns()) {
    // mapper can span partitions
    // combine into as few as one split, subject to the PathFilters set
    // using combine.createPool.
    iss = Arrays.asList(combine.getSplits(job, 1));
  } else {
    for (Path path : inpDirs) {
      processPaths(job, combine, iss, path);
    }
    if (inpFiles.size() > 0) {
      // Processing files
      for (Path filterPath : poolSet) {
        combine.createPool(job, new CombineFilter(filterPath));
      }
      processPaths(job, combine, iss, inpFiles.toArray(new Path[0]));
    }
  }
  if (mrwork.getNameToSplitSample() != null && !mrwork.getNameToSplitSample().isEmpty()) {
    iss = sampleSplits(iss);
  }
  for (CombineFileSplit is : iss) {
    CombineHiveInputSplit csplit = new CombineHiveInputSplit(job, is, pathToPartitionInfo);
    result.add(csplit);
  }
  LOG.info("number of splits " + result.size());
  return result.toArray(new CombineHiveInputSplit[result.size()]);
}
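The key idea is that paths are combined only when they share the same operator list, input format and deserializer; each distinct combination gets its own pool (PathFilter). A minimal sketch of that grouping, where PoolKey and the path/class-name strings are hypothetical stand-ins for CombinePathInputFormat and the real classes:

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Illustrative sketch only: grouping input paths into pools by a composite
// (operator list, input format, deserializer) key, loosely mirroring the
// poolMap keyed by CombinePathInputFormat in getCombineSplits.
public class CombinePoolSketch {

    record PoolKey(List<String> opList, String inputFormat, String deserializer) {}

    public static void main(String[] args) {
        Map<String, PoolKey> pathToKey = new LinkedHashMap<>();
        pathToKey.put("/warehouse/t1/part=1",
            new PoolKey(List.of("TS_0"), "OrcInputFormat", "OrcSerde"));
        pathToKey.put("/warehouse/t1/part=2",
            new PoolKey(List.of("TS_0"), "OrcInputFormat", "OrcSerde"));
        pathToKey.put("/warehouse/t2/part=1",
            new PoolKey(List.of("TS_1"), "TextInputFormat", "LazySimpleSerDe"));

        // One pool per distinct key; paths sharing a key may be combined into
        // the same split, paths with different keys never are.
        Map<PoolKey, List<String>> pools = new LinkedHashMap<>();
        pathToKey.forEach((path, key) ->
            pools.computeIfAbsent(key, k -> new ArrayList<>()).add(path));

        pools.forEach((key, paths) -> System.out.println(key + " -> " + paths));
    }
}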