Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.
From the class DagUtils, method createVertex.
private Vertex createVertex(JobConf conf, MergeJoinWork mergeJoinWork, LocalResource appJarLr,
    List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx,
    VertexType vertexType) throws Exception {
  Utilities.setMergeWork(conf, mergeJoinWork, mrScratchDir, false);
  if (mergeJoinWork.getMainWork() instanceof MapWork) {
    List<BaseWork> mapWorkList = mergeJoinWork.getBaseWorkList();
    MapWork mapWork = (MapWork) (mergeJoinWork.getMainWork());
    Vertex mergeVx = createVertex(conf, mapWork, appJarLr, additionalLr, fs, mrScratchDir, ctx, vertexType);
    conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
    // mapreduce.tez.input.initializer.serialize.event.payload should be set to false
    // when using this plug-in to avoid getting a serialized event at run-time.
    conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
    for (int i = 0; i < mapWorkList.size(); i++) {
      mapWork = (MapWork) (mapWorkList.get(i));
      conf.set(TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX, mapWork.getName());
      conf.set(Utilities.INPUT_NAME, mapWork.getName());
      LOG.info("Going through each work and adding MultiMRInput");
      mergeVx.addDataSource(mapWork.getName(),
          MultiMRInput.createConfigBuilder(conf, HiveInputFormat.class).build());
    }
    VertexManagerPluginDescriptor desc =
        VertexManagerPluginDescriptor.create(CustomPartitionVertex.class.getName());
    // the +1 to the size is because of the main work.
    CustomVertexConfiguration vertexConf = new CustomVertexConfiguration(
        mergeJoinWork.getMergeJoinOperator().getConf().getNumBuckets(), vertexType,
        mergeJoinWork.getBigTableAlias(), mapWorkList.size() + 1);
    DataOutputBuffer dob = new DataOutputBuffer();
    vertexConf.write(dob);
    byte[] userPayload = dob.getData();
    desc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload)));
    mergeVx.setVertexManagerPlugin(desc);
    return mergeVx;
  } else {
    Vertex mergeVx = createVertex(conf, (ReduceWork) mergeJoinWork.getMainWork(), appJarLr,
        additionalLr, fs, mrScratchDir, ctx);
    return mergeVx;
  }
}
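The vertex-manager setup above follows the standard Tez pattern of serializing a Writable configuration (here CustomVertexConfiguration) into a UserPayload. Below is a minimal, self-contained sketch of that pattern, assuming only the Hadoop and Tez classes already used above; the helper name toUserPayload is illustrative, not part of Hive or Tez.

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.tez.dag.api.UserPayload;

public final class PayloadSketch {
  private PayloadSketch() {
  }

  // Serialize any Writable into a Tez UserPayload, as done above for
  // CustomVertexConfiguration before desc.setUserPayload(...).
  public static UserPayload toUserPayload(Writable conf) throws IOException {
    DataOutputBuffer dob = new DataOutputBuffer();
    conf.write(dob);
    // Wrap only the valid bytes; getData() may return a backing array larger than getLength().
    return UserPayload.create(ByteBuffer.wrap(dob.getData(), 0, dob.getLength()));
  }
}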
Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.
From the class SplitSparkWorkResolver, method splitSparkWork.
private void splitSparkWork(SparkWork sparkWork) {
  Queue<BaseWork> queue = new LinkedList<BaseWork>();
  Set<BaseWork> visited = new HashSet<BaseWork>();
  queue.addAll(sparkWork.getRoots());
  while (!queue.isEmpty()) {
    BaseWork work = queue.poll();
    if (!visited.add(work)) {
      continue;
    }
    List<BaseWork> childWorks = sparkWork.getChildren(work);
    // First, add all children of this work into queue, to be processed later.
    for (BaseWork w : childWorks) {
      queue.add(w);
    }
    // Second, check if this work has multiple reduceSinks. If so, do split.
    splitBaseWork(sparkWork, work, childWorks);
  }
}
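One detail worth noting in the traversal above: Set.add returns false when the element is already present, so !visited.add(work) records the visit and detects duplicates in a single call. A trivial illustration (not Hive code):

import java.util.HashSet;
import java.util.Set;

public class VisitedSetDemo {
  public static void main(String[] args) {
    Set<String> visited = new HashSet<>();
    System.out.println(visited.add("Map 1"));  // true: first visit, the work is processed
    System.out.println(visited.add("Map 1"));  // false: already visited, the loop would continue
  }
}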
Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.
From the class SplitSparkWorkResolver, method splitBaseWork.
// Split the work into multiple branches, one for each childWork in childWorks.
// It also sets up the connection between each parent work and child work.
private void splitBaseWork(SparkWork sparkWork, BaseWork parentWork, List<BaseWork> childWorks) {
  if (getAllReduceSinks(parentWork).size() <= 1) {
    return;
  }
  // Grand-parent works - we need to set these to be the parents of the cloned works.
  List<BaseWork> grandParentWorks = sparkWork.getParents(parentWork);
  boolean isFirst = true;
  for (BaseWork childWork : childWorks) {
    BaseWork clonedParentWork = SerializationUtilities.cloneBaseWork(parentWork);
    // give the cloned work a different name
    clonedParentWork.setName(clonedParentWork.getName().replaceAll("^([a-zA-Z]+)(\\s+)(\\d+)",
        "$1$2" + GenSparkUtils.getUtils().getNextSeqNumber()));
    setStatistics(parentWork, clonedParentWork);
    String childReducerName = childWork.getName();
    SparkEdgeProperty clonedEdgeProperty = sparkWork.getEdgeProperty(parentWork, childWork);
    // Prune the leaves of the cloned work: remove every ReduceSinkOperator whose output name
    // is not the corresponding child reducer, and keep non-RS leaves only in the first clone
    // so they are executed exactly once.
    for (Operator<?> op : clonedParentWork.getAllLeafOperators()) {
      if (op instanceof ReduceSinkOperator) {
        if (!((ReduceSinkOperator) op).getConf().getOutputName().equals(childReducerName)) {
          removeOpRecursive(op);
        }
      } else if (!isFirst) {
        removeOpRecursive(op);
      }
    }
    isFirst = false;
    // Then, we need to set up the graph connection. Specifically:
    // 1, we need to connect this cloned parent work with all the grand-parent works.
    // 2, we need to connect this cloned parent work with the corresponding child work.
    sparkWork.add(clonedParentWork);
    for (BaseWork gpw : grandParentWorks) {
      sparkWork.connect(gpw, clonedParentWork, sparkWork.getEdgeProperty(gpw, parentWork));
    }
    sparkWork.connect(clonedParentWork, childWork, clonedEdgeProperty);
    sparkWork.getCloneToWork().put(clonedParentWork, parentWork);
  }
  sparkWork.remove(parentWork);
}
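The rename applied to each cloned work keeps the alphabetic prefix and whitespace of the original name and swaps the trailing number for the next sequence number. A minimal illustration of that regex, using a literal 7 as a stand-in for GenSparkUtils.getUtils().getNextSeqNumber():

public class WorkRenameDemo {
  public static void main(String[] args) {
    int nextSeq = 7; // stand-in for GenSparkUtils.getUtils().getNextSeqNumber()
    String renamed = "Reducer 2".replaceAll("^([a-zA-Z]+)(\\s+)(\\d+)", "$1$2" + nextSeq);
    System.out.println(renamed); // prints "Reducer 7"
  }
}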
Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.
From the class GenTezWork, method getFollowingWorkIndex.
private int getFollowingWorkIndex(TezWork tezWork, UnionWork unionWork, ReduceSinkOperator rs)
    throws SemanticException {
  int index = 0;
  for (BaseWork baseWork : tezWork.getChildren(unionWork)) {
    TezEdgeProperty edgeProperty = tezWork.getEdgeProperty(unionWork, baseWork);
    if (edgeProperty.getEdgeType() != TezEdgeProperty.EdgeType.CONTAINS) {
      return index;
    }
    index++;
  }
  throw new SemanticException("Following work not found for the reduce sink: " + rs.getName());
}
Use of org.apache.hadoop.hive.ql.plan.BaseWork in project hive by apache.
From the class TestGenTezWork, method testCreateMap.
@Test
public void testCreateMap() throws SemanticException {
  proc.process(rs, null, ctx, (Object[]) null);
  assertNotNull(ctx.currentTask);
  assertTrue(ctx.rootTasks.contains(ctx.currentTask));
  TezWork work = ctx.currentTask.getWork();
  assertEquals(work.getAllWork().size(), 1);
  BaseWork w = work.getAllWork().get(0);
  assertTrue(w instanceof MapWork);
  MapWork mw = (MapWork) w;
  // need to make sure names are set for tez to connect things right
  assertNotNull(w.getName());
  // map work should start with our ts op
  assertSame(mw.getAliasToWork().entrySet().iterator().next().getValue(), ts);
  // preceding work must be set to the newly generated map work
  assertSame(ctx.preceedingWork, mw);
  // should have a new root now
  assertSame(ctx.currentRootOperator, fs);
}