Search in sources :

Example 6 with OutputCommitterDescriptor

use of org.apache.tez.dag.api.OutputCommitterDescriptor in project tez by apache.

Class CartesianProduct, method createDAG.

/**
 * Assembles the "CrossProduct" DAG: two token vertices, each reading the same fake data
 * source, feed a join vertex over custom edges driven by the cartesian product edge manager.
 * The join vertex writes to a fake data sink and uses the cartesian product vertex manager.
 *
 * @param tezConf configuration passed to {@code CartesianProductConfig.toUserPayload}
 * @return the DAG containing the three vertices and the two edges into the join vertex
 * @throws IOException declared by the signature; which of the calls below raises it is not
 *         visible from this method alone
 */
private DAG createDAG(TezConfiguration tezConf) throws IOException {
    // One data source descriptor is shared by both token vertices.
    DataSourceDescriptor fakeSource = DataSourceDescriptor.create(
        InputDescriptor.create(FakeInput.class.getName()),
        InputInitializerDescriptor.create(FakeInputInitializer.class.getName()),
        null);

    Vertex firstTokenVertex = Vertex.create(VERTEX1, ProcessorDescriptor.create(TokenProcessor.class.getName()));
    firstTokenVertex.addDataSource(INPUT, fakeSource);
    Vertex secondTokenVertex = Vertex.create(VERTEX2, ProcessorDescriptor.create(TokenProcessor.class.getName()));
    secondTokenVertex.addDataSource(INPUT, fakeSource);

    DataSinkDescriptor fakeSink = DataSinkDescriptor.create(
        OutputDescriptor.create(FakeOutput.class.getName()),
        OutputCommitterDescriptor.create(FakeOutputCommitter.class.getName()),
        null);

    // The same serialized cartesian product config goes to both the vertex manager
    // and the edge manager.
    UserPayload cartesianPayload = new CartesianProductConfig(Arrays.asList(sourceVertices)).toUserPayload(tezConf);

    Vertex joinVertex = Vertex.create(VERTEX3, ProcessorDescriptor.create(JoinProcessor.class.getName()));
    joinVertex.addDataSink(OUTPUT, fakeSink);
    VertexManagerPluginDescriptor vertexManager =
        VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName()).setUserPayload(cartesianPayload);
    joinVertex.setVertexManagerPlugin(vertexManager);

    EdgeManagerPluginDescriptor edgeManager = EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
    edgeManager.setUserPayload(cartesianPayload);
    EdgeProperty crossProductEdge = UnorderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), IntWritable.class.getName(), RoundRobinPartitioner.class.getName())
        .build()
        .createDefaultCustomEdgeProperty(edgeManager);

    return DAG.create("CrossProduct")
        .addVertex(firstTokenVertex)
        .addVertex(secondTokenVertex)
        .addVertex(joinVertex)
        .addEdge(Edge.create(firstTokenVertex, joinVertex, crossProductEdge))
        .addEdge(Edge.create(secondTokenVertex, joinVertex, crossProductEdge));
}
Also used : InputDescriptor(org.apache.tez.dag.api.InputDescriptor) Vertex(org.apache.tez.dag.api.Vertex) UserPayload(org.apache.tez.dag.api.UserPayload) OutputCommitterDescriptor(org.apache.tez.dag.api.OutputCommitterDescriptor) CartesianProductVertexManager(org.apache.tez.runtime.library.cartesianproduct.CartesianProductVertexManager) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) EdgeManagerPluginDescriptor(org.apache.tez.dag.api.EdgeManagerPluginDescriptor) CartesianProductEdgeManager(org.apache.tez.runtime.library.cartesianproduct.CartesianProductEdgeManager) OutputDescriptor(org.apache.tez.dag.api.OutputDescriptor) InputInitializerDescriptor(org.apache.tez.dag.api.InputInitializerDescriptor) UnorderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig) EdgeProperty(org.apache.tez.dag.api.EdgeProperty) CartesianProductConfig(org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor)

Example 7 with OutputCommitterDescriptor

use of org.apache.tez.dag.api.OutputCommitterDescriptor in project hive by apache.

Class DagUtils, method createVertex.

/**
 * Create a vertex from a given work object.
 *
 * <p>The call is dispatched on the concrete type of {@code workUnit} (MapWork, ReduceWork or
 * MergeJoinWork). After the vertex is built, task-local files, launch options and the
 * execution context are applied, the stats publisher is initialized when the work gathers
 * stats, and a data sink is added when the work is a leaf of {@code tezWork} or contains a
 * FileSinkOperator.
 *
 * @param conf JobConf to be used to this execution unit
 * @param workUnit The instance of BaseWork representing the actual work to be performed
 * by this vertex.
 * @param scratchDir HDFS scratch dir for this execution unit.
 * @param tezWork the TezWork queried for the vertex type, parents, edge types and leaves
 * related to {@code workUnit}
 * @param localResources resources added to the vertex as task-local files
 * @return Vertex
 * @throws HiveException if {@code workUnit} is none of the supported work types, or if the
 * stats publisher fails to initialize while HIVE_STATS_RELIABLE is enabled
 * @throws Exception declared by the signature; may originate from the helper methods called here
 */
@SuppressWarnings("deprecation")
public Vertex createVertex(JobConf conf, BaseWork workUnit, Path scratchDir, TezWork tezWork, Map<String, LocalResource> localResources) throws Exception {
    Vertex vertex;
    // simply dispatch the call to the right method for the actual (sub-) type of
    // BaseWork.
    VertexType vertexType = tezWork.getVertexType(workUnit);
    if (workUnit instanceof MapWork) {
        vertex = createVertexFromMapWork(conf, (MapWork) workUnit, scratchDir, vertexType);
    } else if (workUnit instanceof ReduceWork) {
        vertex = createVertexFromReduceWork(conf, (ReduceWork) workUnit, scratchDir);
    } else if (workUnit instanceof MergeJoinWork) {
        vertex = createVertexFromMergeWork(conf, (MergeJoinWork) workUnit, scratchDir, vertexType);
        // set VertexManagerPlugin if this is a cross product destination vertex,
        // i.e. at least one parent is connected through an XPROD_EDGE
        List<String> crossProductSources = new ArrayList<>();
        for (BaseWork parentWork : tezWork.getParents(workUnit)) {
            if (tezWork.getEdgeType(parentWork, workUnit) == EdgeType.XPROD_EDGE) {
                crossProductSources.add(parentWork.getName());
            }
        }
        if (!crossProductSources.isEmpty()) {
            // parallelism shouldn't be set for cartesian product vertex
            CartesianProductConfig cpConfig = new CartesianProductConfig(crossProductSources);
            vertex.setVertexManagerPlugin(VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName()).setUserPayload(cpConfig.toUserPayload(new TezConfiguration(conf))));
        }
    } else {
        // something is seriously wrong if this is happening
        throw new HiveException(ErrorMsg.GENERIC_ERROR.getErrorCodedMsg());
    }
    VertexExecutionContext vertexExecutionContext = createVertexExecutionContext(workUnit);
    vertex.addTaskLocalFiles(localResources);
    vertex.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
    vertex.setExecutionContext(vertexExecutionContext);
    // initialize stats publisher if necessary
    if (workUnit.isGatheringStats()) {
        StatsFactory factory = StatsFactory.newFactory(conf);
        if (factory != null) {
            StatsCollectionContext sCntxt = new StatsCollectionContext(conf);
            sCntxt.setStatsTmpDirs(Utilities.getStatsTmpDirs(workUnit, conf));
            StatsPublisher statsPublisher = factory.getStatsPublisher();
            // creating stats table if not exists; a failed init is only fatal when
            // reliable stats are requested
            if (!statsPublisher.init(sCntxt) && HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                throw new HiveException(ErrorMsg.STATSPUBLISHER_INITIALIZATION_ERROR.getErrorCodedMsg());
            }
        }
    }
    // Hive uses HiveOutputFormatImpl when it is going to write all its data through FS operator;
    // in that case the null output is used instead of MROutput.
    final Class<?> outputKlass;
    if (HiveOutputFormatImpl.class.getName().equals(conf.get("mapred.output.format.class"))) {
        outputKlass = NullMROutput.class;
    } else {
        outputKlass = MROutput.class;
    }
    // If there is a fileSink add a DataSink to the vertex
    boolean hasFileSink = workUnit.getAllOperators().stream().anyMatch(o -> o instanceof FileSinkOperator);
    // final vertices need to have at least one output
    boolean endVertex = tezWork.getLeaves().contains(workUnit);
    if (endVertex || hasFileSink) {
        // The committer is optional: it stays null unless one is configured.
        OutputCommitterDescriptor ocd = null;
        String committer = HiveConf.getVar(conf, ConfVars.TEZ_MAPREDUCE_OUTPUT_COMMITTER);
        if (committer != null && !committer.isEmpty()) {
            ocd = OutputCommitterDescriptor.create(committer);
        }
        // The output reuses the processor's user payload.
        vertex.addDataSink("out_" + workUnit.getName(), new DataSinkDescriptor(OutputDescriptor.create(outputKlass.getName()).setUserPayload(vertex.getProcessorDescriptor().getUserPayload()), ocd, null));
    }
    return vertex;
}
Also used : StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) MergeJoinWork(org.apache.hadoop.hive.ql.plan.MergeJoinWork) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) VertexExecutionContext(org.apache.tez.dag.api.Vertex.VertexExecutionContext) OutputCommitterDescriptor(org.apache.tez.dag.api.OutputCommitterDescriptor) ArrayList(java.util.ArrayList) CartesianProductVertexManager(org.apache.tez.runtime.library.cartesianproduct.CartesianProductVertexManager) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) StatsPublisher(org.apache.hadoop.hive.ql.stats.StatsPublisher) StatsFactory(org.apache.hadoop.hive.ql.stats.StatsFactory) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) HiveOutputFormatImpl(org.apache.hadoop.hive.ql.io.HiveOutputFormatImpl) CartesianProductConfig(org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) VertexType(org.apache.hadoop.hive.ql.plan.TezWork.VertexType) TezConfiguration(org.apache.tez.dag.api.TezConfiguration)

Example 8 with OutputCommitterDescriptor

use of org.apache.tez.dag.api.OutputCommitterDescriptor in project tez by apache.

Class TestCommit, method createDAGPlan_SingleVertexWith2Committer.

/**
 * Builds a DAG plan consisting of one vertex ("vertex1") with two data sinks, each backed by
 * its own CountingOutputCommitter. Used for route event error in VM.
 *
 * @param commit1Succeed its negation is passed as the first argument of the committer config
 *        for sink "v1Out_1" (presumably a fail-on-commit flag; confirm against
 *        CountingOutputCommitterConfig)
 * @param commit2Succeed same as {@code commit1Succeed}, for sink "v1Out_2"
 * @param customVM when true, FailOnVMEventReceivedlVertexManager is installed on the vertex
 * @return the plan produced by {@code DAG.createDag}
 * @throws IOException declared by the signature; the raising call is not visible here
 */
private DAGPlan createDAGPlan_SingleVertexWith2Committer(boolean commit1Succeed, boolean commit2Succeed, boolean customVM) throws IOException {
    LOG.info("Setting up group dag plan");
    final int taskCount = 1;
    final Resource taskResource = Resource.newInstance(1, 1);
    org.apache.tez.dag.api.Vertex vertex = org.apache.tez.dag.api.Vertex.create(
        "vertex1", ProcessorDescriptor.create("Processor"), taskCount, taskResource);
    if (customVM) {
        VertexManagerPluginDescriptor failingManager =
            VertexManagerPluginDescriptor.create(FailOnVMEventReceivedlVertexManager.class.getName());
        vertex.setVertexManagerPlugin(failingManager);
    }

    // Each sink gets its own committer descriptor carrying its own serialized config.
    UserPayload firstPayload = UserPayload.create(ByteBuffer.wrap(
        new CountingOutputCommitter.CountingOutputCommitterConfig(!commit1Succeed, true).toUserPayload()));
    OutputCommitterDescriptor firstCommitter =
        OutputCommitterDescriptor.create(CountingOutputCommitter.class.getName()).setUserPayload(firstPayload);
    UserPayload secondPayload = UserPayload.create(ByteBuffer.wrap(
        new CountingOutputCommitter.CountingOutputCommitterConfig(!commit2Succeed, true).toUserPayload()));
    OutputCommitterDescriptor secondCommitter =
        OutputCommitterDescriptor.create(CountingOutputCommitter.class.getName()).setUserPayload(secondPayload);

    DAG testDag = DAG.create("testDag");
    testDag.addVertex(vertex);
    // Both sinks share the same output descriptor.
    OutputDescriptor sharedOutput = OutputDescriptor.create("output.class");
    vertex.addDataSink("v1Out_1", DataSinkDescriptor.create(sharedOutput, firstCommitter, null));
    vertex.addDataSink("v1Out_2", DataSinkDescriptor.create(sharedOutput, secondCommitter, null));
    return testDag.createDag(conf, null, null, null, true);
}
Also used : OutputDescriptor(org.apache.tez.dag.api.OutputDescriptor) OutputCommitterDescriptor(org.apache.tez.dag.api.OutputCommitterDescriptor) Resource(org.apache.hadoop.yarn.api.records.Resource) DAG(org.apache.tez.dag.api.DAG)

Example 9 with OutputCommitterDescriptor

use of org.apache.tez.dag.api.OutputCommitterDescriptor in project tez by apache.

Class TestDAGImpl, method createGroupDAGPlan.

/**
 * Create a plan with 3 vertices: A, B, C. Group(A,B)->C.
 *
 * <p>Here A and B are "vertex1" and "vertex2", grouped as "uv12"; C is "vertex3". Both the
 * group and vertex3 get a data sink named "uvOut" with the same output descriptor and
 * TotalCountingOutputCommitter descriptor.
 *
 * @return the plan produced by {@code DAG.createDag}
 */
static DAGPlan createGroupDAGPlan() {
    LOG.info("Setting up group dag plan");
    final int taskCount = 1;
    final Resource taskResource = Resource.newInstance(1, 1);
    org.apache.tez.dag.api.Vertex vertexA = org.apache.tez.dag.api.Vertex.create(
        "vertex1", ProcessorDescriptor.create("Processor"), taskCount, taskResource);
    org.apache.tez.dag.api.Vertex vertexB = org.apache.tez.dag.api.Vertex.create(
        "vertex2", ProcessorDescriptor.create("Processor"), taskCount, taskResource);
    org.apache.tez.dag.api.Vertex vertexC = org.apache.tez.dag.api.Vertex.create(
        "vertex3", ProcessorDescriptor.create("Processor"), taskCount, taskResource);

    DAG testDag = DAG.create("testDag");
    OutputCommitterDescriptor countingCommitter =
        OutputCommitterDescriptor.create(TotalCountingOutputCommitter.class.getName());
    org.apache.tez.dag.api.VertexGroup groupAB = testDag.createVertexGroup("uv12", vertexA, vertexB);

    // The group and the downstream vertex each register a sink under the same name.
    OutputDescriptor sinkOutput = OutputDescriptor.create("output.class");
    groupAB.addDataSink("uvOut", DataSinkDescriptor.create(sinkOutput, countingCommitter, null));
    vertexC.addDataSink("uvOut", DataSinkDescriptor.create(sinkOutput, countingCommitter, null));

    EdgeProperty groupEdgeProperty = EdgeProperty.create(
        DataMovementType.SCATTER_GATHER,
        DataSourceType.PERSISTED,
        SchedulingType.SEQUENTIAL,
        OutputDescriptor.create("dummy output class"),
        InputDescriptor.create("dummy input class"));
    GroupInputEdge groupToC = GroupInputEdge.create(
        groupAB, vertexC, groupEdgeProperty, InputDescriptor.create("merge.class"));

    testDag.addVertex(vertexA);
    testDag.addVertex(vertexB);
    testDag.addVertex(vertexC);
    testDag.addEdge(groupToC);
    return testDag.createDag(conf, null, null, null, true);
}
Also used : OutputCommitterDescriptor(org.apache.tez.dag.api.OutputCommitterDescriptor) Resource(org.apache.hadoop.yarn.api.records.Resource) DAG(org.apache.tez.dag.api.DAG) ByteString(com.google.protobuf.ByteString) PlanTaskLocationHint(org.apache.tez.dag.api.records.DAGProtos.PlanTaskLocationHint) OutputDescriptor(org.apache.tez.dag.api.OutputDescriptor) GroupInputEdge(org.apache.tez.dag.api.GroupInputEdge)

Example 10 with OutputCommitterDescriptor

use of org.apache.tez.dag.api.OutputCommitterDescriptor in project tez by apache.

Class TestDAGRecovery2, method testFailingCommitter.

/**
 * Attaches a data sink with a FailingOutputCommitter to vertex "v3" of a SimpleVTestDAG and
 * verifies that running the DAG ends in the FAILED state.
 */
@Test(timeout = 120000)
public void testFailingCommitter() throws Exception {
    DAG failingDag = SimpleVTestDAG.createDAG("FailingCommitterDAG", null);

    // The committer config is carried on the output descriptor's payload, not on the
    // committer descriptor itself.
    OutputDescriptor noOpOutput = OutputDescriptor.create(MultiAttemptDAG.NoOpOutput.class.getName());
    UserPayload committerConfigPayload = UserPayload.create(ByteBuffer.wrap(
        new MultiAttemptDAG.FailingOutputCommitter.FailingOutputCommitterConfig(true).toUserPayload()));
    noOpOutput.setUserPayload(committerConfigPayload);

    OutputCommitterDescriptor failingCommitter =
        OutputCommitterDescriptor.create(MultiAttemptDAG.FailingOutputCommitter.class.getName());
    DataSinkDescriptor failingSink = DataSinkDescriptor.create(noOpOutput, failingCommitter, null);
    failingDag.getVertex("v3").addDataSink("FailingOutput", failingSink);

    runDAGAndVerify(failingDag, State.FAILED);
}
Also used : MultiAttemptDAG(org.apache.tez.test.dag.MultiAttemptDAG) OutputDescriptor(org.apache.tez.dag.api.OutputDescriptor) OutputCommitterDescriptor(org.apache.tez.dag.api.OutputCommitterDescriptor) MultiAttemptDAG(org.apache.tez.test.dag.MultiAttemptDAG) DAG(org.apache.tez.dag.api.DAG) SimpleVTestDAG(org.apache.tez.test.dag.SimpleVTestDAG) Test(org.junit.Test)

Aggregations

OutputCommitterDescriptor (org.apache.tez.dag.api.OutputCommitterDescriptor)11 OutputDescriptor (org.apache.tez.dag.api.OutputDescriptor)9 DAG (org.apache.tez.dag.api.DAG)7 Resource (org.apache.hadoop.yarn.api.records.Resource)5 GroupInputEdge (org.apache.tez.dag.api.GroupInputEdge)4 Vertex (org.apache.tez.dag.api.Vertex)4 UserPayload (org.apache.tez.dag.api.UserPayload)3 Configuration (org.apache.hadoop.conf.Configuration)2 DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor)2 DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor)2 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)2 CartesianProductConfig (org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig)2 CartesianProductVertexManager (org.apache.tez.runtime.library.cartesianproduct.CartesianProductVertexManager)2 ByteString (com.google.protobuf.ByteString)1 ArrayList (java.util.ArrayList)1 TreeMap (java.util.TreeMap)1 ParseException (org.apache.commons.cli.ParseException)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1