Use of org.apache.tez.dag.api.OutputCommitterDescriptor in project tez by apache.
The class CartesianProduct, method createDAG:
private DAG createDAG(TezConfiguration tezConf) throws IOException {
  InputDescriptor inputDescriptor = InputDescriptor.create(FakeInput.class.getName());
  InputInitializerDescriptor inputInitializerDescriptor =
      InputInitializerDescriptor.create(FakeInputInitializer.class.getName());
  DataSourceDescriptor dataSourceDescriptor =
      DataSourceDescriptor.create(inputDescriptor, inputInitializerDescriptor, null);
  Vertex v1 = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v1.addDataSource(INPUT, dataSourceDescriptor);
  Vertex v2 = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
  v2.addDataSource(INPUT, dataSourceDescriptor);
  OutputDescriptor outputDescriptor = OutputDescriptor.create(FakeOutput.class.getName());
  OutputCommitterDescriptor outputCommitterDescriptor =
      OutputCommitterDescriptor.create(FakeOutputCommitter.class.getName());
  DataSinkDescriptor dataSinkDescriptor =
      DataSinkDescriptor.create(outputDescriptor, outputCommitterDescriptor, null);
  CartesianProductConfig cartesianProductConfig =
      new CartesianProductConfig(Arrays.asList(sourceVertices));
  UserPayload userPayload = cartesianProductConfig.toUserPayload(tezConf);
  Vertex v3 = Vertex.create(VERTEX3, ProcessorDescriptor.create(JoinProcessor.class.getName()));
  v3.addDataSink(OUTPUT, dataSinkDescriptor);
  v3.setVertexManagerPlugin(
      VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
          .setUserPayload(userPayload));
  EdgeManagerPluginDescriptor edgeManagerDescriptor =
      EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
  edgeManagerDescriptor.setUserPayload(userPayload);
  UnorderedPartitionedKVEdgeConfig edgeConf = UnorderedPartitionedKVEdgeConfig
      .newBuilder(Text.class.getName(), IntWritable.class.getName(),
          RoundRobinPartitioner.class.getName())
      .build();
  EdgeProperty edgeProperty = edgeConf.createDefaultCustomEdgeProperty(edgeManagerDescriptor);
  return DAG.create("CrossProduct")
      .addVertex(v1).addVertex(v2).addVertex(v3)
      .addEdge(Edge.create(v1, v3, edgeProperty))
      .addEdge(Edge.create(v2, v3, edgeProperty));
}
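For context, a DAG built this way is normally submitted through a TezClient (org.apache.tez.client.TezClient, with org.apache.tez.dag.api.client.DAGClient and DAGStatus). The following is a minimal sketch and not part of the original example; the session name, configuration setup, and error handling are assumptions.

// Minimal submission sketch (assumed surrounding setup; not part of the original example).
TezConfiguration tezConf = new TezConfiguration();
TezClient tezClient = TezClient.create("CrossProductExample", tezConf);
tezClient.start();
try {
  DAG dag = createDAG(tezConf);
  DAGClient dagClient = tezClient.submitDAG(dag);
  // Block until the DAG finishes, then check its terminal state.
  DAGStatus status = dagClient.waitForCompletion();
  if (status.getState() != DAGStatus.State.SUCCEEDED) {
    System.err.println("DAG did not succeed: " + status.getDiagnostics());
  }
} finally {
  tezClient.stop();
}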
Use of org.apache.tez.dag.api.OutputCommitterDescriptor in project hive by apache.
The class DagUtils, method createVertex:
/**
 * Create a vertex from a given work object.
 *
 * @param conf JobConf to be used for this execution unit
 * @param workUnit The instance of BaseWork representing the actual work to be performed
 *        by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @return Vertex
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork workUnit, Path scratchDir, TezWork tezWork,
    Map<String, LocalResource> localResources) throws Exception {
  Vertex vertex;
  // simply dispatch the call to the right method for the actual (sub-) type of BaseWork
  VertexType vertexType = tezWork.getVertexType(workUnit);
  if (workUnit instanceof MapWork) {
    vertex = createVertexFromMapWork(conf, (MapWork) workUnit, scratchDir, vertexType);
  } else if (workUnit instanceof ReduceWork) {
    vertex = createVertexFromReduceWork(conf, (ReduceWork) workUnit, scratchDir);
  } else if (workUnit instanceof MergeJoinWork) {
    vertex = createVertexFromMergeWork(conf, (MergeJoinWork) workUnit, scratchDir, vertexType);
    // set the VertexManagerPlugin if this is a cross product destination vertex
    List<String> crossProductSources = new ArrayList<>();
    for (BaseWork parentWork : tezWork.getParents(workUnit)) {
      if (tezWork.getEdgeType(parentWork, workUnit) == EdgeType.XPROD_EDGE) {
        crossProductSources.add(parentWork.getName());
      }
    }
    if (!crossProductSources.isEmpty()) {
      CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
      vertex.setVertexManagerPlugin(
          VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
              .setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf))));
      // parallelism shouldn't be set for a cartesian product vertex
    }
  } else {
    // something is seriously wrong if this is happening
    throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
  }
  VertexExecutionContext vertexExecutionContext = createVertexExecutionContext(workUnit);
  vertex.addTaskLocalFiles(localResources);
  vertex.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
  vertex.setExecutionContext(vertexExecutionContext);
  // initialize stats publisher if necessary
  if (workUnit.isGatheringStats()) {
    StatsPublisher statsPublisher;
    StatsFactory factory = StatsFactory.newFactory(conf);
    if (factory != null) {
      StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
      sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(workUnit, conf));
      statsPublisher = factory.getStatsPublisher();
      if (!statsPublisher.init(sCntxt)) { // init creates the stats table if it does not exist
        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
          throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
        }
      }
    }
  }
  final Class outputKlass;
  if (HiveOutputFormatImpl.class.getName().equals(conf.get("mapred.output.format.class"))) {
    // Hive uses this output format when it is going to write all its data through the FS operator
    outputKlass = NullMROutput.class;
  } else {
    outputKlass = MROutput.class;
  }
  // If there is a FileSink, add a DataSink to the vertex
  boolean hasFileSink =
      workUnit.getAllOperators().stream().anyMatch(o -> o instanceof FileSinkOperator);
  // final vertices need to have at least one output
  boolean endVertex = tezWork.getLeaves().contains(workUnit);
  if (endVertex || hasFileSink) {
    OutputCommitterDescriptor ocd = null;
    String committer = HiveConf.getVar(conf, ConfVars.TEZ_MAPREDUCE_OUTPUT_COMMITTER);
    if (committer != null && !committer.isEmpty()) {
      ocd = OutputCommitterDescriptor.create(committer);
    }
    vertex.addDataSink("out_" + workUnit.getName(),
        new DataSinkDescriptor(
            OutputDescriptor.create(outputKlass.getName())
                .setUserPayload(vertex.getProcessorDescriptor().getUserPayload()),
            ocd, null));
  }
  return vertex;
}
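The class name passed to OutputCommitterDescriptor.create(committer) here is a Tez output committer, i.e. a subclass of org.apache.tez.runtime.api.OutputCommitter that the Tez AM loads from the DAG classpath. The sketch below shows the shape such a class takes; the class name and no-op bodies are illustrative assumptions, not Hive's or Tez's actual implementation.

// Minimal sketch of a committer class that could be named by the configuration above.
// The class name and no-op behavior are assumptions for illustration only.
import org.apache.tez.dag.api.client.VertexStatus;
import org.apache.tez.runtime.api.OutputCommitter;
import org.apache.tez.runtime.api.OutputCommitterContext;

public class NoOpHiveOutputCommitter extends OutputCommitter {

  public NoOpHiveOutputCommitter(OutputCommitterContext committerContext) {
    super(committerContext);
  }

  @Override
  public void initialize() throws Exception {
    // read configuration or the descriptor's user payload here if needed
  }

  @Override
  public void setupOutput() throws Exception {
    // e.g. create a temporary output location
  }

  @Override
  public void commitOutput() throws Exception {
    // e.g. promote temporary output to its final location
  }

  @Override
  public void abortOutput(VertexStatus.State finalState) throws Exception {
    // e.g. clean up temporary output
  }
}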
Use of org.apache.tez.dag.api.OutputCommitterDescriptor in project tez by apache.
The class TestCommit, method createDAGPlan_SingleVertexWith2Committer:
// customVM is used to exercise route-event errors in the VertexManager
private DAGPlan createDAGPlan_SingleVertexWith2Committer(boolean commit1Succeed,
    boolean commit2Succeed, boolean customVM) throws IOException {
  LOG.info("Setting up group dag plan");
  int dummyTaskCount = 1;
  Resource dummyTaskResource = Resource.newInstance(1, 1);
  org.apache.tez.dag.api.Vertex v1 = org.apache.tez.dag.api.Vertex.create("vertex1",
      ProcessorDescriptor.create("Processor"), dummyTaskCount, dummyTaskResource);
  if (customVM) {
    v1.setVertexManagerPlugin(
        VertexManagerPluginDescriptor.create(FailOnVMEventReceivedlVertexManager.class.getName()));
  }
  OutputCommitterDescriptor ocd1 = OutputCommitterDescriptor
      .create(CountingOutputCommitter.class.getName())
      .setUserPayload(UserPayload.create(ByteBuffer.wrap(
          new CountingOutputCommitter.CountingOutputCommitterConfig(!commit1Succeed, true)
              .toUserPayload())));
  OutputCommitterDescriptor ocd2 = OutputCommitterDescriptor
      .create(CountingOutputCommitter.class.getName())
      .setUserPayload(UserPayload.create(ByteBuffer.wrap(
          new CountingOutputCommitter.CountingOutputCommitterConfig(!commit2Succeed, true)
              .toUserPayload())));
  DAG dag = DAG.create("testDag");
  dag.addVertex(v1);
  OutputDescriptor outDesc = OutputDescriptor.create("output.class");
  v1.addDataSink("v1Out_1", DataSinkDescriptor.create(outDesc, ocd1, null));
  v1.addDataSink("v1Out_2", DataSinkDescriptor.create(outDesc, ocd2, null));
  return dag.createDag(conf, null, null, null, true);
}
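The payload attached through OutputCommitterDescriptor.setUserPayload(...) above is handed to the committer via its OutputCommitterContext. The method below is a hedged illustration of how a committer subclass (like the no-op sketch after the Hive example) could read that payload back in initialize(); it is not the actual CountingOutputCommitter test class, and it assumes imports of java.nio.ByteBuffer and org.apache.tez.dag.api.UserPayload.

// Illustrative only: reading the committer descriptor's user payload inside a subclass of
// org.apache.tez.runtime.api.OutputCommitter.
@Override
public void initialize() throws Exception {
  UserPayload payload = getContext().getUserPayload();
  if (payload != null && payload.getPayload() != null) {
    ByteBuffer buffer = payload.getPayload();
    byte[] configBytes = new byte[buffer.remaining()];
    buffer.get(configBytes);
    // deserialize configBytes into whatever config object the committer expects
  }
}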
Use of org.apache.tez.dag.api.OutputCommitterDescriptor in project tez by apache.
The class TestDAGImpl, method createGroupDAGPlan:
// Create a plan with 3 vertices: A, B, C. Group(A,B) -> C
static DAGPlan createGroupDAGPlan() {
  LOG.info("Setting up group dag plan");
  int dummyTaskCount = 1;
  Resource dummyTaskResource = Resource.newInstance(1, 1);
  org.apache.tez.dag.api.Vertex v1 = org.apache.tez.dag.api.Vertex.create("vertex1",
      ProcessorDescriptor.create("Processor"), dummyTaskCount, dummyTaskResource);
  org.apache.tez.dag.api.Vertex v2 = org.apache.tez.dag.api.Vertex.create("vertex2",
      ProcessorDescriptor.create("Processor"), dummyTaskCount, dummyTaskResource);
  org.apache.tez.dag.api.Vertex v3 = org.apache.tez.dag.api.Vertex.create("vertex3",
      ProcessorDescriptor.create("Processor"), dummyTaskCount, dummyTaskResource);
  DAG dag = DAG.create("testDag");
  String groupName1 = "uv12";
  OutputCommitterDescriptor ocd =
      OutputCommitterDescriptor.create(TotalCountingOutputCommitter.class.getName());
  org.apache.tez.dag.api.VertexGroup uv12 = dag.createVertexGroup(groupName1, v1, v2);
  OutputDescriptor outDesc = OutputDescriptor.create("output.class");
  uv12.addDataSink("uvOut", DataSinkDescriptor.create(outDesc, ocd, null));
  v3.addDataSink("uvOut", DataSinkDescriptor.create(outDesc, ocd, null));
  GroupInputEdge e1 = GroupInputEdge.create(uv12, v3,
      EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED,
          SchedulingType.SEQUENTIAL, OutputDescriptor.create("dummy output class"),
          InputDescriptor.create("dummy input class")),
      InputDescriptor.create("merge.class"));
  dag.addVertex(v1);
  dag.addVertex(v2);
  dag.addVertex(v3);
  dag.addEdge(e1);
  return dag.createDag(conf, null, null, null, true);
}
Use of org.apache.tez.dag.api.OutputCommitterDescriptor in project tez by apache.
The class TestDAGRecovery2, method testFailingCommitter:
@Test(timeout = 120000)
public void testFailingCommitter() throws Exception {
  DAG dag = SimpleVTestDAG.createDAG("FailingCommitterDAG", null);
  OutputDescriptor od = OutputDescriptor.create(MultiAttemptDAG.NoOpOutput.class.getName());
  od.setUserPayload(UserPayload.create(ByteBuffer.wrap(
      new MultiAttemptDAG.FailingOutputCommitter.FailingOutputCommitterConfig(true)
          .toUserPayload())));
  OutputCommitterDescriptor ocd =
      OutputCommitterDescriptor.create(MultiAttemptDAG.FailingOutputCommitter.class.getName());
  dag.getVertex("v3").addDataSink("FailingOutput", DataSinkDescriptor.create(od, ocd, null));
  runDAGAndVerify(dag, State.FAILED);
}
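The test attaches a committer configured to fail and asserts that the DAG ends in the FAILED state. As a conceptual illustration only, a committer whose commitOutput throws has the shape sketched below; this is not the actual MultiAttemptDAG.FailingOutputCommitter, and it assumes the same imports as the earlier no-op committer sketch plus java.io.IOException.

// Conceptual sketch: a committer whose commit always fails, which surfaces as a failed DAG.
public class AlwaysFailingCommitter extends OutputCommitter {

  public AlwaysFailingCommitter(OutputCommitterContext context) {
    super(context);
  }

  @Override
  public void initialize() throws Exception {
  }

  @Override
  public void setupOutput() throws Exception {
  }

  @Override
  public void commitOutput() throws Exception {
    // A failure during DAG commit causes the DAG to terminate as FAILED.
    throw new IOException("simulated commit failure");
  }

  @Override
  public void abortOutput(VertexStatus.State finalState) throws Exception {
  }
}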