use of org.apache.tez.dag.api.Vertex in project hive by apache.
the class TestTezTask method testBuildDag.
/**
 * Builds the DAG for the test work graph and checks that its shape mirrors the work graph:
 * every work item must have a vertex of the same name, and every child of that work item
 * must show up (by name) among the output vertices of that vertex.
 */
@Test
public void testBuildDag() throws IllegalArgumentException, IOException, Exception {
  DAG dag = task.build(conf, work, path, appLr, null, new Context(conf));
  for (BaseWork parent : work.getAllWork()) {
    Vertex parentVertex = dag.getVertex(parent.getName());
    assertNotNull(parentVertex);
    List<Vertex> downstream = parentVertex.getOutputVertices();
    for (BaseWork child : work.getChildren(parent)) {
      // scan the output vertices until one carries the child's name
      boolean connected = false;
      for (int i = 0; i < downstream.size() && !connected; i++) {
        connected = downstream.get(i).getName().equals(child.getName());
      }
      assertTrue(connected);
    }
  }
}
use of org.apache.tez.dag.api.Vertex in project hive by apache.
the class DagUtils method createVertex.
/**
 * Creates a Tez vertex for the given work object by delegating to the overload that
 * matches the concrete work type (MapWork, ReduceWork or MergeJoinWork).
 *
 * After the vertex is built, the stats publisher is initialized when the work gathers
 * stats, and a data sink named {@code "out_" + work.getName()} is attached when the
 * vertex has no children.
 *
 * @param conf JobConf used for this execution unit
 * @param work the BaseWork instance describing what this vertex has to do
 * @param scratchDir scratch directory for this execution unit
 * @param appJarLr local resource for hive-exec
 * @param additionalLr further local resources, handed on to the type-specific overloads
 * @param fileSystem file system handed on to the type-specific overloads
 * @param ctx this query's context
 * @param hasChildren whether the vertex has downstream vertices; when false a data sink
 *          is added so that the vertex has at least one output
 * @param tezWork the enclosing TezWork (not read by this method)
 * @param vertexType vertex type forwarded to the MapWork and MergeJoinWork overloads
 * @return the newly created Vertex
 * @throws HiveException if the work is of an unsupported type, or if the stats publisher
 *           cannot be initialized while reliable stats are required
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork work, Path scratchDir, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fileSystem, Context ctx, boolean hasChildren, TezWork tezWork, VertexType vertexType) throws Exception {
  // dispatch on the concrete (sub-)type of BaseWork
  final Vertex vertex;
  if (work instanceof MapWork) {
    vertex = createVertex(conf, (MapWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
  } else if (work instanceof ReduceWork) {
    vertex = createVertex(conf, (ReduceWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx);
  } else if (work instanceof MergeJoinWork) {
    vertex = createVertex(conf, (MergeJoinWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
  } else {
    // none of the known work types matched
    throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
  }

  // set up the stats publisher when this work collects statistics
  if (work.isGatheringStats()) {
    StatsFactory statsFactory = StatsFactory.newFactory(conf);
    if (statsFactory != null) {
      StatsCollectionContext statsContext = new StatsCollectionContext(conf);
      statsContext.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
      StatsPublisher publisher = statsFactory.getStatsPublisher();
      boolean initialized = publisher.init(statsContext);
      // a failed init is only fatal when reliable stats were requested
      if (!initialized && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
        throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
      }
    }
  }

  if (hasChildren) {
    return vertex;
  }

  // a final vertex must expose at least one output
  String sinkName = "out_" + work.getName();
  OutputDescriptor sinkOutput = OutputDescriptor.create(MROutput.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(conf));
  vertex.addDataSink(sinkName, new DataSinkDescriptor(sinkOutput, null, null));
  return vertex;
}
use of org.apache.tez.dag.api.Vertex in project hive by apache.
the class DagUtils method createVertex.
/*
 * Helper function to create Vertex from MapWork.
 *
 * Builds the data source for the map input (either with splits generated in the AM or
 * with client side split generation), creates the vertex with the map (or merge file)
 * processor, and registers the hive-exec jar plus the additional local resources as task
 * local files.
 *
 * Note that conf is modified by this method (input format class, input name, vectorization
 * flags, and whatever the Utilities calls store in it). The relative order of the split
 * setup and Utilities.setMapWork differs between the two split generation modes and is
 * intentional, see the comments in the body.
 *
 * The fs and ctx parameters are not read by this method.
 *
 * Returns the created Vertex. Declared to throw Exception; an IOException while preparing
 * the temporary output directory of a MergeFileWork is rethrown as a RuntimeException.
 */
private Vertex createVertex(JobConf conf, MapWork mapWork, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType) throws Exception {
Path tezDir = getTezDir(mrScratchDir);
// set up the operator plan
Utilities.cacheMapWork(conf, mapWork, mrScratchDir);
// create the directories FileSinkOperators need
Utilities.createTmpDirs(conf, mapWork);
// finally create the vertex
Vertex map = null;
// use tez to combine splits
boolean groupSplitsInInputInitializer;
DataSourceDescriptor dataSource;
// stays -1 unless client side split generation below supplies the number of shards
int numTasks = -1;
@SuppressWarnings("rawtypes") Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
LOG.info("Vertex has custom input? " + vertexHasCustomInput);
if (vertexHasCustomInput) {
groupSplitsInInputInitializer = false;
// grouping happens in execution phase. The input payload should not enable grouping here,
// it will be enabled in the CustomVertex.
inputFormatClass = HiveInputFormat.class;
conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
// mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
// this plug-in to avoid getting a serialized event at run-time.
conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
} else {
// splits are grouped in the input initializer if and only if the configured input format
// is HiveInputFormat
if (inputFormatClass == HiveInputFormat.class) {
groupSplitsInInputInitializer = true;
} else {
groupSplitsInInputInitializer = false;
}
}
if (mapWork instanceof MergeFileWork) {
Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
// prepare the tmp output directory. The output tmp directory should
// exist before jobClose (before renaming after job completion)
Path tempOutPath = Utilities.toTempPath(outputPath);
try {
FileSystem tmpOutFS = tempOutPath.getFileSystem(conf);
if (!tmpOutFS.exists(tempOutPath)) {
tmpOutFS.mkdirs(tempOutPath);
}
} catch (IOException e) {
// rethrown unchecked with the original exception kept as the cause
throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage(), e);
}
}
// remember mapping of plan to input
conf.set(Utilities.INPUT_NAME, mapWork.getName());
if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
// set up the operator plan. (before setting up splits on the AM)
Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
// splits are generated in the AM, so all that is left to do here is to pick
// the correct plugin.
if (groupSplitsInInputInitializer) {
// Not setting a payload, since the MRInput payload is the same and can be accessed.
InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
} else {
// Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
if (vertexHasCustomInput) {
dataSource = MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
} else {
dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
}
}
} else {
// Setup client side split generation.
// we need to set this, because with HS2 and client side split
// generation we end up not finding the map work. This is
// because of thread local madness (tez split generation is
// multi-threaded - HS2 plan cache uses thread locals). Setting
// VECTOR_MODE/USE_VECTORIZED_INPUT_FILE_FORMAT causes the split gen code to use the conf instead
// of the map work.
conf.setBoolean(Utilities.VECTOR_MODE, mapWork.getVectorMode());
conf.setBoolean(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, mapWork.getUseVectorizedInputFileFormat());
// split files go below the tez dir; spaces in the work name are replaced by underscores
dataSource = MRInputHelpers.configureMRInputWithLegacySplitGeneration(conf, new Path(tezDir, "split_" + mapWork.getName().replaceAll(" ", "_")), true);
numTasks = dataSource.getNumberOfShards();
// set up the operator plan. (after generating splits - that changes configs)
Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
}
// the processor payload is taken from conf only now, after all the modifications above
UserPayload serializedConf = TezUtils.createUserPayloadFromConf(conf);
// merge file work runs with its own processor, everything else with the map processor
String procClassName = MapTezProcessor.class.getName();
if (mapWork instanceof MergeFileWork) {
procClassName = MergeFileTezProcessor.class.getName();
}
VertexExecutionContext executionContext = createVertexExecutionContext(mapWork);
map = Vertex.create(mapWork.getName(), ProcessorDescriptor.create(procClassName).setUserPayload(serializedConf), numTasks, getContainerResource(conf));
map.setTaskEnvironment(getContainerEnvironment(conf, true));
map.setExecutionContext(executionContext);
map.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
// exactly one alias is expected; this is only verified when JVM assertions are enabled
assert mapWork.getAliasToWork().keySet().size() == 1;
// Add the actual source input
String alias = mapWork.getAliasToWork().keySet().iterator().next();
map.addDataSource(alias, dataSource);
// localize the hive-exec jar and all additional resources, keyed by their base name
Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
localResources.put(getBaseName(appJarLr), appJarLr);
for (LocalResource lr : additionalLr) {
localResources.put(getBaseName(lr), lr);
}
map.addTaskLocalFiles(localResources);
return map;
}
use of org.apache.tez.dag.api.Vertex in project hive by apache.
the class DagUtils method createVertex.
/*
 * Helper function to create Vertex from MergeJoinWork.
 *
 * The merge work is first stored through Utilities.setMergeWork. If the main work is not a
 * MapWork it is treated as a ReduceWork and the vertex is created by the ReduceWork
 * overload. Otherwise the vertex is created from the main MapWork, one MultiMRInput data
 * source is added per entry of the base work list, and a CustomPartitionVertex manager
 * plugin is attached whose payload is the serialized CustomVertexConfiguration.
 *
 * Note that conf is modified by this method (input format class, payload serialization
 * flag, current merge file prefix and input name).
 *
 * Returns the created Vertex.
 */
private Vertex createVertex(JobConf conf, MergeJoinWork mergeJoinWork, LocalResource appJarLr, List<LocalResource> additionalLr, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType) throws Exception {
  Utilities.setMergeWork(conf, mergeJoinWork, mrScratchDir, false);
  if (mergeJoinWork.getMainWork() instanceof MapWork) {
    List<BaseWork> mapWorkList = mergeJoinWork.getBaseWorkList();
    MapWork mainWork = (MapWork) (mergeJoinWork.getMainWork());
    Vertex mergeVx = createVertex(conf, mainWork, appJarLr, additionalLr, fs, mrScratchDir, ctx, vertexType);
    conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
    // mapreduce.tez.input.initializer.serialize.event.payload should be set
    // to false when using this plug-in to avoid getting a serialized event at run-time.
    conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
    for (BaseWork baseWork : mapWorkList) {
      MapWork mergedWork = (MapWork) baseWork;
      // each data source is built from conf as it looks right now, so the per-work
      // settings have to be in place before the config builder is invoked
      conf.set(TEZ_MERGE_CURRENT_MERGE_FILE_PREFIX, mergedWork.getName());
      conf.set(Utilities.INPUT_NAME, mergedWork.getName());
      LOG.info("Going through each work and adding MultiMRInput");
      mergeVx.addDataSource(mergedWork.getName(), MultiMRInput.createConfigBuilder(conf, HiveInputFormat.class).build());
    }
    VertexManagerPluginDescriptor desc = VertexManagerPluginDescriptor.create(CustomPartitionVertex.class.getName());
    // the +1 to the size is because of the main work.
    CustomVertexConfiguration vertexConf = new CustomVertexConfiguration(mergeJoinWork.getMergeJoinOperator().getConf().getNumBuckets(), vertexType, mergeJoinWork.getBigTableAlias(), mapWorkList.size() + 1);
    DataOutputBuffer dob = new DataOutputBuffer();
    vertexConf.write(dob);
    // getData() hands out the whole backing array, of which only the first getLength()
    // bytes have been written; limit the payload to that valid prefix
    byte[] userPayload = dob.getData();
    desc.setUserPayload(UserPayload.create(ByteBuffer.wrap(userPayload, 0, dob.getLength())));
    mergeVx.setVertexManagerPlugin(desc);
    return mergeVx;
  } else {
    Vertex mergeVx = createVertex(conf, (ReduceWork) mergeJoinWork.getMainWork(), appJarLr, additionalLr, fs, mrScratchDir, ctx);
    return mergeVx;
  }
}
Aggregations