Use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.
The class TestMRRJobsDAGApi, method testMRRSleepJobDagSubmitCore.
public State testMRRSleepJobDagSubmitCore(boolean dagViaRPC, boolean killDagWhileRunning,
    boolean closeSessionBeforeSubmit, TezClient reUseTezSession, boolean genSplitsInAM,
    Class<? extends InputInitializer> initializerClass,
    Map<String, LocalResource> additionalLocalResources) throws IOException,
    InterruptedException, TezException, ClassNotFoundException, YarnException {
  LOG.info("\n\n\nStarting testMRRSleepJobDagSubmit().");
  JobConf stage1Conf = new JobConf(mrrTezCluster.getConfig());
  JobConf stage2Conf = new JobConf(mrrTezCluster.getConfig());
  JobConf stage3Conf = new JobConf(mrrTezCluster.getConfig());
  stage1Conf.setLong(MRRSleepJob.MAP_SLEEP_TIME, 1);
  stage1Conf.setInt(MRRSleepJob.MAP_SLEEP_COUNT, 1);
  stage1Conf.setInt(MRJobConfig.NUM_MAPS, 1);
  stage1Conf.set(MRJobConfig.MAP_CLASS_ATTR, SleepMapper.class.getName());
  stage1Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
  stage1Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
  stage1Conf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, SleepInputFormat.class.getName());
  stage1Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());
  stage2Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
  stage2Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
  stage2Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
  stage2Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, ISleepReducer.class.getName());
  stage2Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
  stage2Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
  stage2Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());
  stage3Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
  stage3Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
  stage3Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
  stage3Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, SleepReducer.class.getName());
  stage3Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
  stage3Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
  MRHelpers.translateMRConfToTez(stage1Conf);
  MRHelpers.translateMRConfToTez(stage2Conf);
  MRHelpers.translateMRConfToTez(stage3Conf);
  MRHelpers.configureMRApiUsage(stage1Conf);
  MRHelpers.configureMRApiUsage(stage2Conf);
  MRHelpers.configureMRApiUsage(stage3Conf);
  Path remoteStagingDir =
      remoteFs.makeQualified(new Path("/tmp", String.valueOf(new Random().nextInt(100000))));
  TezClientUtils.ensureStagingDirExists(conf, remoteStagingDir);
  UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf);
  UserPayload stage2Payload = TezUtils.createUserPayloadFromConf(stage2Conf);
  UserPayload stage3Payload = TezUtils.createUserPayloadFromConf(stage3Conf);
  DAG dag = DAG.create("testMRRSleepJobDagSubmit-" + random.nextInt(1000));
  Class<? extends InputInitializer> inputInitializerClazz = genSplitsInAM
      ? (initializerClass == null ? MRInputAMSplitGenerator.class : initializerClass)
      : null;
  LOG.info("Using initializer class: " + initializerClass);
  DataSourceDescriptor dsd;
  if (!genSplitsInAM) {
    dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, remoteStagingDir, true);
  } else {
    if (initializerClass == null) {
      dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class).build();
    } else {
      InputInitializerDescriptor iid = InputInitializerDescriptor.create(inputInitializerClazz.getName());
      dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class)
          .setCustomInitializerDescriptor(iid).build();
    }
  }
  Vertex stage1Vertex = Vertex.create("map",
      ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(stage1Payload),
      dsd.getNumberOfShards(), Resource.newInstance(256, 1));
  stage1Vertex.addDataSource("MRInput", dsd);
  Vertex stage2Vertex = Vertex.create("ireduce",
      ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage2Payload),
      1, Resource.newInstance(256, 1));
  Vertex stage3Vertex = Vertex.create("reduce",
      ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage3Payload),
      1, Resource.newInstance(256, 1));
  stage3Conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT, true);
  DataSinkDescriptor dataSinkDescriptor =
      MROutputLegacy.createConfigBuilder(stage3Conf, NullOutputFormat.class).build();
  Assert.assertFalse(dataSinkDescriptor.getOutputDescriptor().getHistoryText().isEmpty());
  stage3Vertex.addDataSink("MROutput", dataSinkDescriptor);
  // TODO env, resources
  dag.addVertex(stage1Vertex);
  dag.addVertex(stage2Vertex);
  dag.addVertex(stage3Vertex);
  Edge edge1 = Edge.create(stage1Vertex, stage2Vertex,
      EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED,
          SchedulingType.SEQUENTIAL,
          OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage2Payload),
          InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage2Payload)));
  Edge edge2 = Edge.create(stage2Vertex, stage3Vertex,
      EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED,
          SchedulingType.SEQUENTIAL,
          OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage3Payload),
          InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage3Payload)));
  dag.addEdge(edge1);
  dag.addEdge(edge2);
  TezConfiguration tezConf = new TezConfiguration(mrrTezCluster.getConfig());
  tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, remoteStagingDir.toString());
  DAGClient dagClient = null;
  boolean reuseSession = reUseTezSession != null;
  TezClient tezSession = null;
  if (!dagViaRPC) {
    Preconditions.checkArgument(reuseSession == false);
  }
  if (!reuseSession) {
    TezConfiguration tempTezconf = new TezConfiguration(tezConf);
    if (!dagViaRPC) {
      tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, false);
    } else {
      tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, true);
    }
    tezSession = TezClient.create("testsession", tempTezconf);
    tezSession.start();
  } else {
    tezSession = reUseTezSession;
  }
  if (!dagViaRPC) {
    // TODO Use utility method post TEZ-205 to figure out AM arguments etc.
    dagClient = tezSession.submitDAG(dag);
  }
  if (dagViaRPC && closeSessionBeforeSubmit) {
    YarnClient yarnClient = YarnClient.createYarnClient();
    yarnClient.init(mrrTezCluster.getConfig());
    yarnClient.start();
    boolean sentKillSession = false;
    while (true) {
      Thread.sleep(500L);
      ApplicationReport appReport =
          yarnClient.getApplicationReport(tezSession.getAppMasterApplicationId());
      if (appReport == null) {
        continue;
      }
      YarnApplicationState appState = appReport.getYarnApplicationState();
      if (!sentKillSession) {
        if (appState == YarnApplicationState.RUNNING) {
          tezSession.stop();
          sentKillSession = true;
        }
      } else {
        if (appState == YarnApplicationState.FINISHED || appState == YarnApplicationState.KILLED
            || appState == YarnApplicationState.FAILED) {
          LOG.info("Application completed after sending session shutdown"
              + ", yarnApplicationState=" + appState
              + ", finalAppStatus=" + appReport.getFinalApplicationStatus());
          Assert.assertEquals(YarnApplicationState.FINISHED, appState);
          Assert.assertEquals(FinalApplicationStatus.SUCCEEDED, appReport.getFinalApplicationStatus());
          break;
        }
      }
    }
    yarnClient.stop();
    return null;
  }
  if (dagViaRPC) {
    LOG.info("Submitting dag to tez session with appId=" + tezSession.getAppMasterApplicationId()
        + " and Dag Name=" + dag.getName());
    if (additionalLocalResources != null) {
      tezSession.addAppMasterLocalFiles(additionalLocalResources);
    }
    dagClient = tezSession.submitDAG(dag);
    Assert.assertEquals(TezAppMasterStatus.RUNNING, tezSession.getAppMasterStatus());
  }
  DAGStatus dagStatus = dagClient.getDAGStatus(null);
  while (!dagStatus.isCompleted()) {
    LOG.info("Waiting for job to complete. Sleeping for 500ms." + " Current state: "
        + dagStatus.getState());
    Thread.sleep(500L);
    if (killDagWhileRunning && dagStatus.getState() == DAGStatus.State.RUNNING) {
      LOG.info("Killing running dag/session");
      if (dagViaRPC) {
        tezSession.stop();
      } else {
        dagClient.tryKillDAG();
      }
    }
    dagStatus = dagClient.getDAGStatus(null);
  }
  if (!reuseSession) {
    tezSession.stop();
  }
  return dagStatus.getState();
}
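The DataSinkDescriptor-specific part of this long test is small. A minimal sketch of just that pattern, assuming an existing JobConf stageConf and Vertex v (illustrative names, not from the test):

// Enable history text so the sink's user payload is stored human-readable in
// the DAG history; the test's assertion on getHistoryText() relies on this.
stageConf.setBoolean(
    TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT, true);
// MROutputLegacy wraps an MR OutputFormat as a Tez output and returns a
// ready-to-attach DataSinkDescriptor.
DataSinkDescriptor sink =
    MROutputLegacy.createConfigBuilder(stageConf, NullOutputFormat.class).build();
v.addDataSink("MROutput", sink);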
Use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.
The class CartesianProduct, method createDAG.
private DAG createDAG(TezConfiguration tezConf) throws IOException {
  InputDescriptor inputDescriptor = InputDescriptor.create(FakeInput.class.getName());
  InputInitializerDescriptor inputInitializerDescriptor =
      InputInitializerDescriptor.create(FakeInputInitializer.class.getName());
  DataSourceDescriptor dataSourceDescriptor =
      DataSourceDescriptor.create(inputDescriptor, inputInitializerDescriptor, null);
  Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v1.addDataSource(INPUT, dataSourceDescriptor);
  Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v2.addDataSource(INPUT, dataSourceDescriptor);
  OutputDescriptor outputDescriptor = OutputDescriptor.create(FakeOutput.class.getName());
  OutputCommitterDescriptor outputCommitterDescriptor =
      OutputCommitterDescriptor.create(FakeOutputCommitter.class.getName());
  DataSinkDescriptor dataSinkDescriptor =
      DataSinkDescriptor.create(outputDescriptor, outputCommitterDescriptor, null);
  CartesianProductConfig cartesianProductConfig =
      new CartesianProductConfig(Arrays.asList(sourceVertices));
  UserPayload userPayload = cartesianProductConfig.toUserPayload(tezConf);
  Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(JoinProcessor.class.getName()));
  v3.addDataSink(OUTPUT, dataSinkDescriptor);
  v3.setVertexManagerPlugin(VertexManagerPluginDescriptor
      .create(CartesianProductVertexManager.class.getName()).setUserPayload(userPayload));
  EdgeManagerPluginDescriptor edgeManagerDescriptor =
      EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
  edgeManagerDescriptor.setUserPayload(userPayload);
  UnorderedPartitionedKVEdgeConfig edgeConf = UnorderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          RoundRobinPartitioner.class.getName()).build();
  EdgeProperty edgeProperty = edgeConf.createDefaultCustomEdgeProperty(edgeManagerDescriptor);
  return DAG.create("CrossProduct").addVertex(v1).addVertex(v2).addVertex(v3)
      .addEdge(Edge.create(v1, v3, edgeProperty)).addEdge(Edge.create(v2, v3, edgeProperty));
}
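Unlike the MR-flavored examples, this one builds the DataSinkDescriptor directly from its parts rather than through a config builder. A condensed sketch of just that step (FakeOutput and FakeOutputCommitter are the example's own stand-in classes); the third argument to create() is the Credentials needed to write to the sink, and null means none are required:

DataSinkDescriptor sink = DataSinkDescriptor.create(
    OutputDescriptor.create(FakeOutput.class.getName()),
    OutputCommitterDescriptor.create(FakeOutputCommitter.class.getName()),
    null); // credentials: none needed for this fake output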
Use of org.apache.tez.dag.api.DataSinkDescriptor in project tez by apache.
The class UnionExample, method createDAG.
private DAG createDAG(FileSystem fs, TezConfiguration tezConf,
    Map<String, LocalResource> localResources, Path stagingDir, String inputPath,
    String outputPath) throws IOException {
  DAG dag = DAG.create("UnionExample");
  int numMaps = -1;
  Configuration inputConf = new Configuration(tezConf);
  inputConf.setBoolean("mapred.mapper.new-api", false);
  inputConf.set("mapred.input.format.class", TextInputFormat.class.getName());
  inputConf.set(FileInputFormat.INPUT_DIR, inputPath);
  MRInput.MRInputConfigBuilder configurer = MRInput.createConfigBuilder(inputConf, null);
  DataSourceDescriptor dataSource = configurer.generateSplitsInAM(false).build();
  Vertex mapVertex1 = Vertex.create("map1",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex mapVertex2 = Vertex.create("map2",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex mapVertex3 = Vertex.create("map3",
      ProcessorDescriptor.create(TokenProcessor.class.getName()), numMaps)
      .addDataSource("MRInput", dataSource);
  Vertex checkerVertex = Vertex.create("checker",
      ProcessorDescriptor.create(UnionProcessor.class.getName()), 1);
  Configuration outputConf = new Configuration(tezConf);
  outputConf.setBoolean("mapred.reducer.new-api", false);
  outputConf.set("mapred.output.format.class", TextOutputFormat.class.getName());
  outputConf.set(FileOutputFormat.OUTDIR, outputPath);
  DataSinkDescriptor od = MROutput.createConfigBuilder(outputConf, null).build();
  checkerVertex.addDataSink("union", od);
  Configuration allPartsConf = new Configuration(tezConf);
  DataSinkDescriptor od2 = MROutput.createConfigBuilder(allPartsConf,
      TextOutputFormat.class, outputPath + "-all-parts").build();
  checkerVertex.addDataSink("all-parts", od2);
  Configuration partsConf = new Configuration(tezConf);
  DataSinkDescriptor od1 = MROutput.createConfigBuilder(partsConf,
      TextOutputFormat.class, outputPath + "-parts").build();
  VertexGroup unionVertex = dag.createVertexGroup("union", mapVertex1, mapVertex2);
  unionVertex.addDataSink("parts", od1);
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          HashPartitioner.class.getName()).build();
  dag.addVertex(mapVertex1).addVertex(mapVertex2).addVertex(mapVertex3).addVertex(checkerVertex)
      .addEdge(Edge.create(mapVertex3, checkerVertex, edgeConf.createDefaultEdgeProperty()))
      .addEdge(GroupInputEdge.create(unionVertex, checkerVertex,
          edgeConf.createDefaultEdgeProperty(),
          InputDescriptor.create(ConcatenatedMergedKeyValuesInput.class.getName())));
  return dag;
}
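Note the two ways a sink is attached here: od and od2 go to a single vertex, while od1 goes to a VertexGroup. A sketch of the group variant in isolation, under the same assumptions as the example (dag, mapVertex1, mapVertex2, tezConf, outputPath as above):

// A DataSinkDescriptor attached to a VertexGroup is shared by the member
// vertices, so their combined output lands in one committed location instead
// of one output per vertex.
VertexGroup group = dag.createVertexGroup("union", mapVertex1, mapVertex2);
group.addDataSink("parts",
    MROutput.createConfigBuilder(new Configuration(tezConf),
        TextOutputFormat.class, outputPath + "-parts").build());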
Use of org.apache.tez.dag.api.DataSinkDescriptor in project hive by apache.
The class DagUtils, method createVertex.
/**
 * Create a vertex from a given work object.
 *
 * @param conf JobConf to be used for this execution unit
 * @param workUnit The instance of BaseWork representing the actual work to be performed
 * by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @param tezWork The TezWork this unit belongs to; used to look up the vertex type and parent edges.
 * @param localResources Local resources to attach to the vertex's tasks.
 * @return Vertex
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork workUnit, Path scratchDir, TezWork tezWork,
    Map<String, LocalResource> localResources) throws Exception {
  Vertex vertex;
  // simply dispatch the call to the right method for the actual (sub-) type of
  // BaseWork.
  VertexType vertexType = tezWork.getVertexType(workUnit);
  if (workUnit instanceof MapWork) {
    vertex = createVertexFromMapWork(conf, (MapWork) workUnit, scratchDir, vertexType);
  } else if (workUnit instanceof ReduceWork) {
    vertex = createVertexFromReduceWork(conf, (ReduceWork) workUnit, scratchDir);
  } else if (workUnit instanceof MergeJoinWork) {
    vertex = createVertexFromMergeWork(conf, (MergeJoinWork) workUnit, scratchDir, vertexType);
    // set VertexManagerPlugin if it's a cross product destination vertex
    List<String> crossProductSources = new ArrayList<>();
    for (BaseWork parentWork : tezWork.getParents(workUnit)) {
      if (tezWork.getEdgeType(parentWork, workUnit) == EdgeType.XPROD_EDGE) {
        crossProductSources.add(parentWork.getName());
      }
    }
    if (!crossProductSources.isEmpty()) {
      CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
      vertex.setVertexManagerPlugin(
          VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
              .setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf))));
      // parallelism shouldn't be set for cartesian product vertex
    }
  } else {
    // something is seriously wrong if this is happening
    throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
  }
  VertexExecutionContext vertexExecutionContext = createVertexExecutionContext(workUnit);
  vertex.addTaskLocalFiles(localResources);
  vertex.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
  vertex.setExecutionContext(vertexExecutionContext);
  // initialize stats publisher if necessary
  if (workUnit.isGatheringStats()) {
    StatsPublisher statsPublisher;
    StatsFactory factory = StatsFactory.newFactory(conf);
    if (factory != null) {
      StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
      sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(workUnit, conf));
      statsPublisher = factory.getStatsPublisher();
      if (!statsPublisher.init(sCntxt)) {
        // creating stats table if not exists
        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
          throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
        }
      }
    }
  }
  final Class outputKlass;
  if (HiveOutputFormatImpl.class.getName().equals(conf.get("mapred.output.format.class"))) {
    // Hive uses this output format when it is going to write all its data through the FS operator
    outputKlass = NullMROutput.class;
  } else {
    outputKlass = MROutput.class;
  }
  // If there is a fileSink, add a DataSink to the vertex
  boolean hasFileSink = workUnit.getAllOperators().stream()
      .anyMatch(o -> o instanceof FileSinkOperator);
  // final vertices need to have at least one output
  boolean endVertex = tezWork.getLeaves().contains(workUnit);
  if (endVertex || hasFileSink) {
    OutputCommitterDescriptor ocd = null;
    String committer = HiveConf.getVar(conf, ConfVars.TEZ_MAPREDUCE_OUTPUT_COMMITTER);
    if (committer != null && !committer.isEmpty()) {
      ocd = OutputCommitterDescriptor.create(committer);
    }
    vertex.addDataSink("out_" + workUnit.getName(),
        new DataSinkDescriptor(
            OutputDescriptor.create(outputKlass.getName())
                .setUserPayload(vertex.getProcessorDescriptor().getUserPayload()),
            ocd, null));
  }
  return vertex;
}
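This snippet constructs the descriptor with new DataSinkDescriptor(...), which is presumably why the method carries @SuppressWarnings("deprecation"). A sketch of the factory-method equivalent, assuming the same outputKlass, vertex, workUnit, and ocd as above:

// DataSinkDescriptor.create(output, committer, credentials) is the
// non-deprecated counterpart of the public constructor used in the snippet.
vertex.addDataSink("out_" + workUnit.getName(),
    DataSinkDescriptor.create(
        OutputDescriptor.create(outputKlass.getName())
            .setUserPayload(vertex.getProcessorDescriptor().getUserPayload()),
        ocd, null));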
Use of org.apache.tez.dag.api.DataSinkDescriptor in project hive by apache.
The class DagUtils, method createVertex (overload).
/**
 * Create a vertex from a given work object.
 *
 * @param conf JobConf to be used for this execution unit
 * @param work The instance of BaseWork representing the actual work to be performed
 * by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @param appJarLr Local resource for hive-exec.
 * @param additionalLr Additional local resources for this vertex.
 * @param fileSystem FS corresponding to scratchDir and LocalResources
 * @param ctx This query's context
 * @return Vertex
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork work, Path scratchDir, LocalResource appJarLr,
    List<LocalResource> additionalLr, FileSystem fileSystem, Context ctx, boolean hasChildren,
    TezWork tezWork, VertexType vertexType) throws Exception {
  Vertex v = null;
  // simply dispatch the call to the right method for the actual (sub-) type of
  // BaseWork.
  if (work instanceof MapWork) {
    v = createVertex(conf, (MapWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
  } else if (work instanceof ReduceWork) {
    v = createVertex(conf, (ReduceWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx);
  } else if (work instanceof MergeJoinWork) {
    v = createVertex(conf, (MergeJoinWork) work, appJarLr, additionalLr, fileSystem, scratchDir, ctx, vertexType);
  } else {
    // something is seriously wrong if this is happening
    throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
  }
  // initialize stats publisher if necessary
  if (work.isGatheringStats()) {
    StatsPublisher statsPublisher;
    StatsFactory factory = StatsFactory.newFactory(conf);
    if (factory != null) {
      StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
      sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(work, conf));
      statsPublisher = factory.getStatsPublisher();
      if (!statsPublisher.init(sCntxt)) {
        // creating stats table if not exists
        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
          throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
        }
      }
    }
  }
  // final vertices need to have at least one output
  if (!hasChildren) {
    v.addDataSink("out_" + work.getName(),
        new DataSinkDescriptor(
            OutputDescriptor.create(MROutput.class.getName())
                .setUserPayload(TezUtils.createUserPayloadFromConf(conf)),
            null, null));
  }
  return v;
}
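Both Hive variants serialize the JobConf into the output's UserPayload themselves rather than going through an MROutput config builder. A sketch of that step in isolation, assuming a populated JobConf conf:

// TezUtils.createUserPayloadFromConf serializes the Configuration into the
// UserPayload carried by the descriptor; the output (here MROutput) reads it
// back at task runtime.
UserPayload payload = TezUtils.createUserPayloadFromConf(conf);
DataSinkDescriptor sink = DataSinkDescriptor.create(
    OutputDescriptor.create(MROutput.class.getName()).setUserPayload(payload),
    null /* committer */, null /* credentials */);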