Use of org.apache.hadoop.hive.ql.exec.spark.SparkTask in project hive by apache.
In class SparkCompiler, method generateTaskTree:
/**
 * TODO: need to turn on rules that are commented out and add more if necessary.
 */
@Override
protected void generateTaskTree(List<Task<? extends Serializable>> rootTasks, ParseContext pCtx,
    List<Task<MoveWork>> mvTask, Set<ReadEntity> inputs, Set<WriteEntity> outputs)
    throws SemanticException {
  PERF_LOGGER.PerfLogBegin(CLASS_NAME, PerfLogger.SPARK_GENERATE_TASK_TREE);
  GenSparkUtils utils = GenSparkUtils.getUtils();
  utils.resetSequenceNumber();
  ParseContext tempParseContext = getParseContext(pCtx, rootTasks);
  GenSparkProcContext procCtx = new GenSparkProcContext(
      conf, tempParseContext, mvTask, rootTasks, inputs, outputs, pCtx.getTopOps());
  // -------------------------------- First Pass ---------------------------------- //
  // Identify SparkPartitionPruningSinkOperators, and break OP tree if necessary
  Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
  opRules.put(new RuleRegExp("Clone OP tree for PartitionPruningSink",
      SparkPartitionPruningSinkOperator.getOperatorName() + "%"), new SplitOpTreeForDPP());
  Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx);
  GraphWalker ogw = new GenSparkWorkWalker(disp, procCtx);
  List<Node> topNodes = new ArrayList<Node>();
  topNodes.addAll(pCtx.getTopOps().values());
  ogw.startWalking(topNodes, null);
  // -------------------------------- Second Pass ---------------------------------- //
  // Process operator tree in two steps: first we process the extra op trees generated
  // in the first pass. Then we process the main op tree, and the result task will depend
  // on the task generated in the first pass.
  topNodes.clear();
  topNodes.addAll(procCtx.topOps.values());
  generateTaskTreeHelper(procCtx, topNodes);
  // If this set is not empty, it means we need to generate a separate task for collecting
  // the partitions used.
  if (!procCtx.clonedPruningTableScanSet.isEmpty()) {
    SparkTask pruningTask = SparkUtilities.createSparkTask(conf);
    SparkTask mainTask = procCtx.currentTask;
    pruningTask.addDependentTask(procCtx.currentTask);
    procCtx.rootTasks.remove(procCtx.currentTask);
    procCtx.rootTasks.add(pruningTask);
    procCtx.currentTask = pruningTask;
    topNodes.clear();
    topNodes.addAll(procCtx.clonedPruningTableScanSet);
    generateTaskTreeHelper(procCtx, topNodes);
    procCtx.currentTask = mainTask;
  }
  // we need to clone some operator plans and remove union operators still
  for (BaseWork w : procCtx.workWithUnionOperators) {
    GenSparkUtils.getUtils().removeUnionOperators(procCtx, w);
  }
  // we need to fill MapWork with 'local' work and bucket information for SMB Join.
  GenSparkUtils.getUtils().annotateMapWork(procCtx);
  // finally make sure the file sink operators are set up right
  for (FileSinkOperator fileSink : procCtx.fileSinkSet) {
    GenSparkUtils.getUtils().processFileSink(procCtx, fileSink);
  }
  // Process partition pruning sinks
  for (Operator<?> prunerSink : procCtx.pruningSinkSet) {
    utils.processPartitionPruningSink(procCtx, (SparkPartitionPruningSinkOperator) prunerSink);
  }
  PERF_LOGGER.PerfLogEnd(CLASS_NAME, PerfLogger.SPARK_GENERATE_TASK_TREE);
}
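The pruning branch above re-roots the task graph: when cloned pruning table scans exist, a new SparkTask becomes the root, the original task is made dependent on it, and work generation temporarily targets the new task before the original task is restored. The following is a minimal, self-contained sketch of that re-rooting pattern; SimpleTask and every name in it are invented for illustration and are not Hive classes.

import java.util.ArrayList;
import java.util.List;

public class ReRootSketch {

  // Minimal stand-in for a task node with dependent-task links (not Hive's Task class).
  static class SimpleTask {
    final String name;
    final List<SimpleTask> children = new ArrayList<>();
    SimpleTask(String name) { this.name = name; }
    void addDependentTask(SimpleTask child) { children.add(child); }
  }

  public static void main(String[] args) {
    List<SimpleTask> rootTasks = new ArrayList<>();
    SimpleTask mainTask = new SimpleTask("main SparkTask");
    rootTasks.add(mainTask);

    // Mirror of the pruning-task wiring above: the new task becomes the root
    // and the original task now runs after it.
    SimpleTask pruningTask = new SimpleTask("pruning SparkTask");
    pruningTask.addDependentTask(mainTask);
    rootTasks.remove(mainTask);
    rootTasks.add(pruningTask);

    System.out.println("root: " + rootTasks.get(0).name
        + " -> child: " + rootTasks.get(0).children.get(0).name);
  }
}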
Use of org.apache.hadoop.hive.ql.exec.spark.SparkTask in project hive by apache.
In class SparkSkewJoinProcFactory, method splitTask:
/**
* If the join is not in a leaf ReduceWork, the spark task has to be split into 2 tasks.
*/
private static void splitTask(SparkTask currentTask, ReduceWork reduceWork,
    ParseContext parseContext) throws SemanticException {
  SparkWork currentWork = currentTask.getWork();
  Set<Operator<?>> reduceSinkSet = OperatorUtils.getOp(reduceWork, ReduceSinkOperator.class);
  if (currentWork.getChildren(reduceWork).size() == 1 && canSplit(currentWork)
      && reduceSinkSet.size() == 1) {
    ReduceSinkOperator reduceSink = (ReduceSinkOperator) reduceSinkSet.iterator().next();
    BaseWork childWork = currentWork.getChildren(reduceWork).get(0);
    SparkEdgeProperty originEdge = currentWork.getEdgeProperty(reduceWork, childWork);
    // disconnect the reduce work from its child. this should produce two isolated sub graphs
    currentWork.disconnect(reduceWork, childWork);
    // move works following the current reduce work into a new spark work
    SparkWork newWork =
        new SparkWork(parseContext.getConf().getVar(HiveConf.ConfVars.HIVEQUERYID));
    newWork.add(childWork);
    copyWorkGraph(currentWork, newWork, childWork);
    // remove them from current spark work
    for (BaseWork baseWork : newWork.getAllWorkUnsorted()) {
      currentWork.remove(baseWork);
      currentWork.getCloneToWork().remove(baseWork);
    }
    // create TS to read intermediate data
    Context baseCtx = parseContext.getContext();
    Path taskTmpDir = baseCtx.getMRTmpPath();
    Operator<? extends OperatorDesc> rsParent = reduceSink.getParentOperators().get(0);
    TableDesc tableDesc = PlanUtils.getIntermediateFileTableDesc(
        PlanUtils.getFieldSchemasFromRowSchema(rsParent.getSchema(), "temporarycol"));
    // this will insert FS and TS between the RS and its parent
    TableScanOperator tableScanOp = GenMapRedUtils.createTemporaryFile(
        rsParent, reduceSink, taskTmpDir, tableDesc, parseContext);
    // create new MapWork
    MapWork mapWork = PlanUtils.getMapRedWork().getMapWork();
    mapWork.setName("Map " + GenSparkUtils.getUtils().getNextSeqNumber());
    newWork.add(mapWork);
    newWork.connect(mapWork, childWork, originEdge);
    // setup the new map work
    String streamDesc = taskTmpDir.toUri().toString();
    if (GenMapRedUtils.needsTagging((ReduceWork) childWork)) {
      Operator<? extends OperatorDesc> childReducer = ((ReduceWork) childWork).getReducer();
      String id = null;
      if (childReducer instanceof JoinOperator) {
        if (parseContext.getJoinOps().contains(childReducer)) {
          id = ((JoinOperator) childReducer).getConf().getId();
        }
      } else if (childReducer instanceof MapJoinOperator) {
        if (parseContext.getMapJoinOps().contains(childReducer)) {
          id = ((MapJoinOperator) childReducer).getConf().getId();
        }
      } else if (childReducer instanceof SMBMapJoinOperator) {
        if (parseContext.getSmbMapJoinOps().contains(childReducer)) {
          id = ((SMBMapJoinOperator) childReducer).getConf().getId();
        }
      }
      if (id != null) {
        streamDesc = id + ":$INTNAME";
      } else {
        streamDesc = "$INTNAME";
      }
      String origStreamDesc = streamDesc;
      int pos = 0;
      while (mapWork.getAliasToWork().get(streamDesc) != null) {
        streamDesc = origStreamDesc.concat(String.valueOf(++pos));
      }
    }
    GenMapRedUtils.setTaskPlan(taskTmpDir, streamDesc, tableScanOp, mapWork, false, tableDesc);
    // insert the new task between current task and its child
    @SuppressWarnings("unchecked") Task<?> newTask = TaskFactory.get(newWork);
    List<Task<?>> childTasks = currentTask.getChildTasks();
    // must have at most one child
    if (childTasks != null && childTasks.size() > 0) {
      Task<?> childTask = childTasks.get(0);
      currentTask.removeDependentTask(childTask);
      newTask.addDependentTask(childTask);
    }
    currentTask.addDependentTask(newTask);
    newTask.setFetchSource(currentTask.isFetchSource());
  }
}
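The heart of splitTask is graph surgery: the edge from reduceWork to its child is removed, every work reachable from the child is moved into a new SparkWork, and a new MapWork reading the intermediate file reconnects the two halves. A simplified, self-contained sketch of the disconnect-and-move step follows; the Graph class and the node names are hypothetical stand-ins, not Hive's SparkWork API.

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

// Hypothetical adjacency-list graph standing in for SparkWork (illustration only).
class Graph {
  final Map<String, List<String>> edges = new LinkedHashMap<>();

  void add(String n) {
    edges.putIfAbsent(n, new ArrayList<>());
  }

  void connect(String parent, String child) {
    add(parent);
    add(child);
    edges.get(parent).add(child);
  }

  void disconnect(String parent, String child) {
    edges.get(parent).remove(child);
  }

  void remove(String n) {
    edges.remove(n);
    for (List<String> children : edges.values()) {
      children.remove(n);
    }
  }

  // All nodes reachable from start, including start itself.
  Set<String> reachableFrom(String start) {
    Set<String> seen = new LinkedHashSet<>();
    Deque<String> stack = new ArrayDeque<>();
    stack.push(start);
    while (!stack.isEmpty()) {
      String cur = stack.pop();
      if (seen.add(cur)) {
        for (String child : edges.getOrDefault(cur, List.of())) {
          stack.push(child);
        }
      }
    }
    return seen;
  }
}

public class SplitWorkSketch {
  public static void main(String[] args) {
    Graph currentWork = new Graph();
    currentWork.connect("Map 1", "Reduce 2");    // "Reduce 2" plays the role of reduceWork
    currentWork.connect("Reduce 2", "Reduce 3"); // "Reduce 3" plays the role of childWork
    currentWork.connect("Reduce 3", "Reduce 4");

    // Disconnect the reduce work from its child, then move the downstream
    // subgraph into a new work, mirroring what splitTask does (the real code
    // also copies the edges between the moved works via copyWorkGraph).
    currentWork.disconnect("Reduce 2", "Reduce 3");
    Graph newWork = new Graph();
    for (String w : currentWork.reachableFrom("Reduce 3")) {
      newWork.add(w);
      currentWork.remove(w);
    }

    System.out.println("remaining work: " + currentWork.edges.keySet()); // [Map 1, Reduce 2]
    System.out.println("new work: " + newWork.edges.keySet());           // [Reduce 3, Reduce 4]
  }
}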
Use of org.apache.hadoop.hive.ql.exec.spark.SparkTask in project hive by apache.
In class SparkSkewJoinProcFactory, method supportRuntimeSkewJoin:
private static boolean supportRuntimeSkewJoin(JoinOperator joinOp, ReduceWork reduceWork,
    Task<?> currTask, HiveConf hiveConf) {
  if (currTask instanceof SparkTask && GenMRSkewJoinProcessor.skewJoinEnabled(hiveConf, joinOp)) {
    SparkWork sparkWork = ((SparkTask) currTask).getWork();
    List<Task<?>> children = currTask.getChildTasks();
    return !joinOp.getConf().isFixedAsSorted() && sparkWork.contains(reduceWork)
        && (children == null || children.size() <= 1)
        && OperatorUtils.getOp(reduceWork, CommonJoinOperator.class).size() == 1;
  }
  return false;
}
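The return expression above folds four requirements into one line. The hypothetical helper below (plain Java, not Hive code) spells those requirements out as named booleans purely for readability; all parameter and variable names are invented.

public class SkewJoinConditionSketch {

  // Hypothetical readability helper: the four checks behind runtime skew join
  // support, expressed as named booleans (not Hive's actual code).
  static boolean supportRuntimeSkewJoin(boolean joinFixedAsSorted, boolean workContainsReduceWork,
      int childTaskCount, int commonJoinOperatorCount) {
    boolean notSortMergeJoin = !joinFixedAsSorted;
    boolean reduceWorkBelongsToThisTask = workContainsReduceWork;
    boolean atMostOneChildTask = childTaskCount <= 1;
    boolean exactlyOneJoinInReducer = commonJoinOperatorCount == 1;
    return notSortMergeJoin && reduceWorkBelongsToThisTask
        && atMostOneChildTask && exactlyOneJoinInReducer;
  }

  public static void main(String[] args) {
    System.out.println(supportRuntimeSkewJoin(false, true, 1, 1)); // true
    System.out.println(supportRuntimeSkewJoin(true, true, 0, 1));  // false: fixed as sorted
  }
}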
Use of org.apache.hadoop.hive.ql.exec.spark.SparkTask in project hive by apache.
In class TestUtilities, method testGetTasksHaveNoRepeats:
/**
 * This test verifies that Utilities.get*Tasks do not repeat themselves while extracting
 * tasks from a given set of root tasks when the DAGs can have multiple paths, as is the
 * case with diamond-shaped DAGs common to replication.
 */
@Test
public void testGetTasksHaveNoRepeats() {
  CountingWrappingTask mrTask = new CountingWrappingTask(new ExecDriver());
  CountingWrappingTask tezTask = new CountingWrappingTask(new TezTask());
  CountingWrappingTask sparkTask = new CountingWrappingTask(new SparkTask());
  // First check - we should not have repeats in results
  assertEquals("No repeated MRTasks from Utilities.getMRTasks", 1,
      Utilities.getMRTasks(getTestDiamondTaskGraph(mrTask)).size());
  assertEquals("No repeated TezTasks from Utilities.getTezTasks", 1,
      Utilities.getTezTasks(getTestDiamondTaskGraph(tezTask)).size());
  assertEquals("No repeated SparkTasks from Utilities.getSparkTasks", 1,
      Utilities.getSparkTasks(getTestDiamondTaskGraph(sparkTask)).size());
  // Second check - the tasks we looked for must not have been accessed more than
  // once as a result of the traversal (note that we actually wind up accessing them
  // 2 times, because each visit counts twice: once to check for existence, and
  // once to visit).
  assertEquals("MRTasks should have been visited only once", 2, mrTask.getDepCallCount());
  assertEquals("TezTasks should have been visited only once", 2, tezTask.getDepCallCount());
  assertEquals("SparkTasks should have been visited only once", 2, sparkTask.getDepCallCount());
}
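A diamond-shaped DAG has two paths from the root to the bottom task, so a naive traversal would report the bottom task twice. Below is a small, self-contained sketch of the visited-set idea that keeps each task from being collected more than once; the Node class and the collect method are hypothetical illustrations, not Hive's Utilities implementation or its test graph.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DiamondTraversalSketch {

  // Hypothetical task node with child links (not Hive's Task class).
  static class Node {
    final String name;
    final List<Node> children = new ArrayList<>();
    Node(String name) { this.name = name; }
  }

  // Collect every node exactly once, even when several paths lead to it.
  static List<Node> collect(Node root) {
    List<Node> result = new ArrayList<>();
    Set<Node> visited = new HashSet<>();
    walk(root, visited, result);
    return result;
  }

  static void walk(Node node, Set<Node> visited, List<Node> result) {
    if (!visited.add(node)) {
      return; // already seen via another path
    }
    result.add(node);
    for (Node child : node.children) {
      walk(child, visited, result);
    }
  }

  public static void main(String[] args) {
    // Diamond: top -> left -> bottom and top -> right -> bottom.
    Node top = new Node("top");
    Node left = new Node("left");
    Node right = new Node("right");
    Node bottom = new Node("bottom");
    top.children.add(left);
    top.children.add(right);
    left.children.add(bottom);
    right.children.add(bottom);

    System.out.println(collect(top).size()); // 4, not 5: "bottom" is reported only once
  }
}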
Use of org.apache.hadoop.hive.ql.exec.spark.SparkTask in project hive by apache.
In class SparkCrossProductCheck, method dispatch:
@Override
public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs) throws SemanticException {
  @SuppressWarnings("unchecked") Task<?> currTask = (Task<?>) nd;
  if (currTask instanceof SparkTask) {
    SparkWork sparkWork = ((SparkTask) currTask).getWork();
    checkShuffleJoin(sparkWork);
    checkMapJoin((SparkTask) currTask);
  } else if (currTask instanceof ConditionalTask) {
    List<Task<?>> taskList = ((ConditionalTask) currTask).getListTasks();
    for (Task<?> task : taskList) {
      dispatch(task, stack, nodeOutputs);
    }
  }
  return null;
}
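The dispatch method above inspects SparkTasks directly and recurses into a ConditionalTask so that Spark stages nested in its task list are checked as well. A stripped-down sketch of that dispatch-and-recurse shape is shown below with hypothetical task types; it is not the SparkCrossProductCheck code itself, and every class name in it is invented.

import java.util.Arrays;
import java.util.List;

public class DispatchSketch {

  // Hypothetical task hierarchy (not Hive's classes).
  static class SimpleTask { }

  static class SimpleSparkTask extends SimpleTask {
    final String name;
    SimpleSparkTask(String name) { this.name = name; }
  }

  static class SimpleConditionalTask extends SimpleTask {
    final List<SimpleTask> listTasks;
    SimpleConditionalTask(List<SimpleTask> listTasks) { this.listTasks = listTasks; }
  }

  // Inspect Spark tasks directly; unwrap conditional tasks and recurse,
  // mirroring the shape of the dispatch method above.
  static void dispatch(SimpleTask task) {
    if (task instanceof SimpleSparkTask) {
      System.out.println("checking cross products in " + ((SimpleSparkTask) task).name);
    } else if (task instanceof SimpleConditionalTask) {
      for (SimpleTask sub : ((SimpleConditionalTask) task).listTasks) {
        dispatch(sub);
      }
    }
  }

  public static void main(String[] args) {
    SimpleTask nested = new SimpleConditionalTask(Arrays.asList(
        new SimpleSparkTask("stage-1"), new SimpleSparkTask("stage-2")));
    dispatch(nested);
  }
}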