Search in sources :

Example 16 with InputDescriptor

use of org.apache.tez.dag.api.InputDescriptor in project tez by apache.

the class VertexInitializedEvent method toProto.

/**
 * Builds the recovery protobuf representation of this event.
 *
 * <p>Every entry of {@code additionalInputs} (when present and non-empty) is written as a
 * {@code RootInputLeafOutputProto}; its controller descriptor is only set when the input
 * actually has one. Every event of {@code initGeneratedEvents} (when present and non-empty)
 * is converted through {@code TezEventUtils.toProto}. Finally the vertex id, vertex name,
 * init-requested time, init time and task count are copied onto the builder.
 *
 * @return the populated {@code VertexInitializedProto}
 * @throws IOException declared by the signature; presumably propagated from the conversion helpers
 */
public RecoveryProtos.VertexInitializedProto toProto() throws IOException {
    VertexInitializedProto.Builder protoBuilder = VertexInitializedProto.newBuilder();

    boolean hasAdditionalInputs = additionalInputs != null && !additionalInputs.isEmpty();
    if (hasAdditionalInputs) {
        for (RootInputLeafOutput<InputDescriptor, InputInitializerDescriptor> rootInput : additionalInputs.values()) {
            RootInputLeafOutputProto.Builder rootInputBuilder = RootInputLeafOutputProto.newBuilder();
            rootInputBuilder.setName(rootInput.getName());
            // The controller (initializer) descriptor is optional; leave the field unset when absent.
            InputInitializerDescriptor controller = rootInput.getControllerDescriptor();
            if (controller != null) {
                rootInputBuilder.setControllerDescriptor(DagTypeConverters.convertToDAGPlan(controller));
            }
            rootInputBuilder.setIODescriptor(DagTypeConverters.convertToDAGPlan(rootInput.getIODescriptor()));
            protoBuilder.addInputs(rootInputBuilder.build());
        }
    }

    boolean hasInitEvents = initGeneratedEvents != null && !initGeneratedEvents.isEmpty();
    if (hasInitEvents) {
        for (TezEvent generatedEvent : initGeneratedEvents) {
            protoBuilder.addInitGeneratedEvents(TezEventUtils.toProto(generatedEvent));
        }
    }

    // Scalar fields are written last, in the same order as before.
    protoBuilder.setVertexId(vertexID.toString());
    protoBuilder.setVertexName(vertexName);
    protoBuilder.setInitRequestedTime(initRequestedTime);
    protoBuilder.setInitTime(initedTime);
    protoBuilder.setNumTasks(numTasks);
    return protoBuilder.build();
}
Also used : VertexInitializedProto(org.apache.tez.dag.recovery.records.RecoveryProtos.VertexInitializedProto) InputDescriptor(org.apache.tez.dag.api.InputDescriptor) InputInitializerDescriptor(org.apache.tez.dag.api.InputInitializerDescriptor) RootInputLeafOutputProto(org.apache.tez.dag.api.records.DAGProtos.RootInputLeafOutputProto) TezEvent(org.apache.tez.runtime.api.impl.TezEvent)

Example 17 with InputDescriptor

use of org.apache.tez.dag.api.InputDescriptor in project tez by apache.

the class CartesianProduct method createDAG.

/**
 * Assembles the "CrossProduct" DAG: two source vertices (VERTEX1, VERTEX2) that each read
 * from the same fake data source and feed a third vertex (VERTEX3) over a custom
 * cartesian-product edge; VERTEX3 writes to a fake data sink.
 *
 * @param tezConf configuration used to serialize the cartesian product config into a payload
 * @return the DAG with three vertices and two edges
 * @throws IOException declared by the signature; presumably raised while building the payload
 */
private DAG createDAG(TezConfiguration tezConf) throws IOException {
    // One data source descriptor is shared by both source vertices.
    DataSourceDescriptor sharedSource = DataSourceDescriptor.create(
        InputDescriptor.create(FakeInput.class.getName()),
        InputInitializerDescriptor.create(FakeInputInitializer.class.getName()),
        null);

    String tokenProcessorName = TokenProcessor.class.getName();
    Vertex firstSource = Vertex.create(VERTEX1, ProcessorDescriptor.create(tokenProcessorName));
    firstSource.addDataSource(INPUT, sharedSource);
    Vertex secondSource = Vertex.create(VERTEX2, ProcessorDescriptor.create(tokenProcessorName));
    secondSource.addDataSource(INPUT, sharedSource);

    DataSinkDescriptor sink = DataSinkDescriptor.create(
        OutputDescriptor.create(FakeOutput.class.getName()),
        OutputCommitterDescriptor.create(FakeOutputCommitter.class.getName()),
        null);

    // The same serialized config is handed to both the vertex manager and the edge manager.
    CartesianProductConfig productConfig = new CartesianProductConfig(Arrays.asList(sourceVertices));
    UserPayload productPayload = productConfig.toUserPayload(tezConf);

    Vertex joinVertex = Vertex.create(VERTEX3, ProcessorDescriptor.create(JoinProcessor.class.getName()));
    joinVertex.addDataSink(OUTPUT, sink);
    VertexManagerPluginDescriptor vertexManager =
        VertexManagerPluginDescriptor.create(CartesianProductVertexManager.class.getName())
            .setUserPayload(productPayload);
    joinVertex.setVertexManagerPlugin(vertexManager);

    EdgeManagerPluginDescriptor edgeManager =
        EdgeManagerPluginDescriptor.create(CartesianProductEdgeManager.class.getName());
    edgeManager.setUserPayload(productPayload);

    UnorderedPartitionedKVEdgeConfig kvEdgeConfig = UnorderedPartitionedKVEdgeConfig
        .newBuilder(Text.class.getName(), IntWritable.class.getName(), RoundRobinPartitioner.class.getName())
        .build();
    EdgeProperty customEdge = kvEdgeConfig.createDefaultCustomEdgeProperty(edgeManager);

    return DAG.create("CrossProduct")
        .addVertex(firstSource)
        .addVertex(secondSource)
        .addVertex(joinVertex)
        .addEdge(Edge.create(firstSource, joinVertex, customEdge))
        .addEdge(Edge.create(secondSource, joinVertex, customEdge));
}
Also used : InputDescriptor(org.apache.tez.dag.api.InputDescriptor) Vertex(org.apache.tez.dag.api.Vertex) UserPayload(org.apache.tez.dag.api.UserPayload) OutputCommitterDescriptor(org.apache.tez.dag.api.OutputCommitterDescriptor) CartesianProductVertexManager(org.apache.tez.runtime.library.cartesianproduct.CartesianProductVertexManager) DataSinkDescriptor(org.apache.tez.dag.api.DataSinkDescriptor) EdgeManagerPluginDescriptor(org.apache.tez.dag.api.EdgeManagerPluginDescriptor) CartesianProductEdgeManager(org.apache.tez.runtime.library.cartesianproduct.CartesianProductEdgeManager) OutputDescriptor(org.apache.tez.dag.api.OutputDescriptor) InputInitializerDescriptor(org.apache.tez.dag.api.InputInitializerDescriptor) UnorderedPartitionedKVEdgeConfig(org.apache.tez.runtime.library.conf.UnorderedPartitionedKVEdgeConfig) EdgeProperty(org.apache.tez.dag.api.EdgeProperty) CartesianProductConfig(org.apache.tez.runtime.library.cartesianproduct.CartesianProductConfig) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor)

Example 18 with InputDescriptor

use of org.apache.tez.dag.api.InputDescriptor in project hive by apache.

the class DagUtils method createVertex.

/*
   * Helper function to create Vertex from MapWork.
   *
   * Steps, in order:
   *   1. cache the map work and create the temp directories it needs;
   *   2. decide whether splits are grouped in the input initializer
   *      (only for non-custom-input vertices whose input format is HiveInputFormat);
   *   3. for MergeFileWork, make sure the temporary output directory exists;
   *   4. build the DataSourceDescriptor, either for split generation driven by an
   *      input initializer (ConfVars.HIVE_AM_SPLIT_GENERATION is true) or from
   *      splits generated here via MRInputHelpers.generateInputSplitsToMem;
   *   5. create the Vertex (named after the map work) with the processor payload
   *      serialized from conf, and attach the data source under the work's single alias.
   *
   * conf is mutated along the way (input format class, Utilities.INPUT_NAME, vector-mode
   * flags, ...) and the processor payload is serialized only after those mutations, so
   * statement order matters here.
   *
   * The fs and ctx parameters are not read in this method body.
   *
   * Throws RuntimeException (wrapping the IOException) if the temporary merge output
   * directory cannot be checked/created; other failures propagate as declared (Exception).
   */
private Vertex createVertex(JobConf conf, MapWork mapWork, FileSystem fs, Path mrScratchDir, Context ctx, VertexType vertexType, Map<String, LocalResource> localResources) throws Exception {
    // set up the operator plan
    Utilities.cacheMapWork(conf, mapWork, mrScratchDir);
    // create the directories FileSinkOperators need
    Utilities.createTmpDirs(conf, mapWork);
    // finally create the vertex
    Vertex map = null;
    // use tez to combine splits
    boolean groupSplitsInInputInitializer;
    DataSourceDescriptor dataSource;
    // stays -1 unless splits are generated in this method (else-branch below sets it)
    int numTasks = -1;
    @SuppressWarnings("rawtypes") Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
    boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
    LOG.info("Vertex has custom input? " + vertexHasCustomInput);
    if (vertexHasCustomInput) {
        groupSplitsInInputInitializer = false;
        // grouping happens in execution phase. The input payload should not enable grouping here,
        // it will be enabled in the CustomVertex.
        inputFormatClass = HiveInputFormat.class;
        conf.setClass("mapred.input.format.class", HiveInputFormat.class, InputFormat.class);
        // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
        // this plug-in to avoid getting a serialized event at run-time.
        conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
    } else {
        // group splits in the input initializer only if the configured input format
        // is HiveInputFormat
        if (inputFormatClass == HiveInputFormat.class) {
            groupSplitsInInputInitializer = true;
        } else {
            groupSplitsInInputInitializer = false;
        }
    }
    if (mapWork instanceof MergeFileWork) {
        Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
        // prepare the tmp output directory. The output tmp directory should
        // exist before jobClose (before renaming after job completion)
        Path tempOutPath = Utilities.toTempPath(outputPath);
        try {
            FileSystem tmpOutFS = tempOutPath.getFileSystem(conf);
            if (!tmpOutFS.exists(tempOutPath)) {
                tmpOutFS.mkdirs(tempOutPath);
            }
        } catch (IOException e) {
            // NOTE(review): the message reports outputPath, while the operations that can
            // fail here act on tempOutPath — confirm this is intended.
            throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage(), e);
        }
    }
    // remember mapping of plan to input
    conf.set(Utilities.INPUT_NAME, mapWork.getName());
    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
        // set up the operator plan. (before setting up splits on the AM)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
        // splits are not generated here; we only need to build the data source with
        // the correct plugin.
        if (groupSplitsInInputInitializer) {
            // Not setting a payload, since the MRInput payload is the same and can be accessed.
            InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
            dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
        } else {
            // Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
            if (vertexHasCustomInput && vertexType == VertexType.MULTI_INPUT_UNINITIALIZED_EDGES) {
                // SMB Join.
                dataSource = MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            } else {
                dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
            }
        }
    } else {
        // Setup client side split generation.
        // we need to set this, because with HS2 and client side split
        // generation we end up not finding the map work. This is
        // because of thread local madness (tez split generation is
        // multi-threaded - HS2 plan cache uses thread locals). Setting
        // VECTOR_MODE/USE_VECTORIZED_INPUT_FILE_FORMAT causes the split gen code to use the conf instead
        // of the map work.
        conf.setBoolean(Utilities.VECTOR_MODE, mapWork.getVectorMode());
        conf.setBoolean(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, mapWork.getUseVectorizedInputFileFormat());
        InputSplitInfo inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(conf, false, 0);
        InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName());
        // the input payload carries both the serialized conf and the splits generated above
        InputDescriptor inputDescriptor = InputDescriptor.create(MRInputLegacy.class.getName()).setUserPayload(UserPayload.create(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder().setConfigurationBytes(TezUtils.createByteStringFromConf(conf)).setSplits(inputSplitInfo.getSplitsProto()).build().toByteString().asReadOnlyByteBuffer()));
        dataSource = DataSourceDescriptor.create(inputDescriptor, descriptor, null);
        numTasks = inputSplitInfo.getNumTasks();
        // set up the operator plan. (after generating splits - that changes configs)
        Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
    }
    // serialize conf for the processor only now, after all the conf changes above
    UserPayload serializedConf = TezUtils.createUserPayloadFromConf(conf);
    // merge-file work runs with its own processor class; everything else uses MapTezProcessor
    String procClassName = MapTezProcessor.class.getName();
    if (mapWork instanceof MergeFileWork) {
        procClassName = MergeFileTezProcessor.class.getName();
    }
    VertexExecutionContext executionContext = createVertexExecutionContext(mapWork);
    map = Vertex.create(mapWork.getName(), ProcessorDescriptor.create(procClassName).setUserPayload(serializedConf), numTasks, getContainerResource(conf));
    map.setTaskEnvironment(getContainerEnvironment(conf, true));
    map.setExecutionContext(executionContext);
    map.setTaskLaunchCmdOpts(getContainerJavaOpts(conf));
    // exactly one alias is expected; this is only checked when JVM assertions are enabled
    assert mapWork.getAliasToWork().keySet().size() == 1;
    // Add the actual source input
    String alias = mapWork.getAliasToWork().keySet().iterator().next();
    map.addDataSource(alias, dataSource);
    map.addTaskLocalFiles(localResources);
    return map;
}
Also used : Path(org.apache.hadoop.fs.Path) InputDescriptor(org.apache.tez.dag.api.InputDescriptor) Vertex(org.apache.tez.dag.api.Vertex) PreWarmVertex(org.apache.tez.dag.api.PreWarmVertex) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) UserPayload(org.apache.tez.dag.api.UserPayload) VertexExecutionContext(org.apache.tez.dag.api.Vertex.VertexExecutionContext) InputSplitInfo(org.apache.tez.mapreduce.hadoop.InputSplitInfo) IOException(java.io.IOException) MRInputSplitDistributor(org.apache.tez.mapreduce.common.MRInputSplitDistributor) FileSystem(org.apache.hadoop.fs.FileSystem) InputInitializerDescriptor(org.apache.tez.dag.api.InputInitializerDescriptor) DataSourceDescriptor(org.apache.tez.dag.api.DataSourceDescriptor)

Example 19 with InputDescriptor

use of org.apache.tez.dag.api.InputDescriptor in project tez by apache.

the class TestRootInputInitializerManager method testCorrectUgiUsage.

// Runs a single input initializer through RootInputInitializerManager and checks that the
// UGI recorded by InputInitializerForUgiTest, both at construction and at initialize time,
// equals the DAG UGI handed to the manager.
@Test(timeout = 5000)
public void testCorrectUgiUsage() throws TezException, InterruptedException {
    // Collaborators of the manager, stubbed before the manager is constructed.
    AppContext mockAppContext = mock(AppContext.class);
    doReturn(new DefaultHadoopShim()).when(mockAppContext).getHadoopShim();
    doReturn(mock(EventHandler.class)).when(mockAppContext).getEventHandler();
    Vertex mockVertex = mock(Vertex.class);
    doReturn(mock(TezVertexID.class)).when(mockVertex).getVertexId();
    StateChangeNotifier mockNotifier = mock(StateChangeNotifier.class);
    UserGroupInformation expectedUgi = UserGroupInformation.createRemoteUser("fakeuser");

    RootInputInitializerManager manager =
        new RootInputInitializerManager(mockVertex, mockAppContext, expectedUgi, mockNotifier);

    // A root input whose initializer is the UGI-recording test initializer.
    InputDescriptor inputDescriptor = mock(InputDescriptor.class);
    InputInitializerDescriptor initializerDescriptor =
        InputInitializerDescriptor.create(InputInitializerForUgiTest.class.getName());
    RootInputLeafOutput<InputDescriptor, InputInitializerDescriptor> ugiTestInput =
        new RootInputLeafOutput<>("InputName", inputDescriptor, initializerDescriptor);

    manager.runInputInitializers(Collections.singletonList(ugiTestInput));
    InputInitializerForUgiTest.awaitInitialize();

    assertEquals(expectedUgi, InputInitializerForUgiTest.ctorUgi);
    assertEquals(expectedUgi, InputInitializerForUgiTest.initializeUgi);
}
Also used : InputDescriptor(org.apache.tez.dag.api.InputDescriptor) RootInputLeafOutput(org.apache.tez.dag.api.RootInputLeafOutput) AppContext(org.apache.tez.dag.app.AppContext) EventHandler(org.apache.hadoop.yarn.event.EventHandler) DefaultHadoopShim(org.apache.tez.hadoop.shim.DefaultHadoopShim) InputInitializerDescriptor(org.apache.tez.dag.api.InputInitializerDescriptor) TezVertexID(org.apache.tez.dag.records.TezVertexID) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) Test(org.junit.Test)

Example 20 with InputDescriptor

use of org.apache.tez.dag.api.InputDescriptor in project tez by apache.

the class TestRootInputInitializerManager method testEventBeforeSuccess.

// Events sent by an attempt must not reach the initializer until that attempt's task succeeds.
// Also covers two attempts of the same task both being reported as successful through the
// stateChangeNotifier (a failure scenario: the task went back to running after success).
// Sequence exercised: event1, success1, event2, success2.
@SuppressWarnings("unchecked")
@Test(timeout = 5000)
public void testEventBeforeSuccess() throws Exception {
    // Identifiers for the source task and its two attempts.
    ApplicationId applicationId = ApplicationId.newInstance(1000, 1);
    TezDAGID dagId = TezDAGID.getInstance(applicationId, 1);
    TezVertexID sourceVertexId = TezVertexID.getInstance(dagId, 2);
    TezTaskID sourceTaskId = TezTaskID.getInstance(sourceVertexId, 3);
    String sourceVertexName = "srcVertexName";

    // Wrapper under test, built entirely from mocks.
    InputDescriptor inputDescriptor = mock(InputDescriptor.class);
    InputInitializerDescriptor initializerDescriptor = mock(InputInitializerDescriptor.class);
    RootInputLeafOutput<InputDescriptor, InputInitializerDescriptor> rootInput =
        new RootInputLeafOutput<>("InputName", inputDescriptor, initializerDescriptor);
    InputInitializer mockInitializer = mock(InputInitializer.class);
    InputInitializerContext mockInitializerContext = mock(InputInitializerContext.class);
    Vertex owningVertex = mock(Vertex.class);
    StateChangeNotifier mockNotifier = mock(StateChangeNotifier.class);
    AppContext mockAppContext = mock(AppContext.class, RETURNS_DEEP_STUBS);
    RootInputInitializerManager.InitializerWrapper wrapper =
        new RootInputInitializerManager.InitializerWrapper(
            rootInput, mockInitializer, mockInitializerContext, owningVertex, mockNotifier, mockAppContext);

    // The source task is still RUNNING when looked up through the current DAG.
    Vertex sourceVertex = mock(Vertex.class);
    Task sourceTask = mock(Task.class);
    doReturn(TaskState.RUNNING).when(sourceTask).getState();
    doReturn(sourceTask).when(sourceVertex).getTask(sourceTaskId.getId());
    when(mockAppContext.getCurrentDAG().getVertex(any(String.class))).thenReturn(sourceVertex);

    List<TezEvent> pendingEvents = Lists.newLinkedList();

    // Attempt 1 sends an event: nothing may be delivered yet.
    TezTaskAttemptID firstAttemptId = TezTaskAttemptID.getInstance(sourceTaskId, 1);
    EventMetaData firstAttemptMeta = new EventMetaData(
        EventMetaData.EventProducerConsumerType.PROCESSOR, sourceVertexName, null, firstAttemptId);
    InputInitializerEvent firstEvent = InputInitializerEvent.create("fakeVertex", "fakeInput", null);
    pendingEvents.add(new TezEvent(firstEvent, firstAttemptMeta));
    wrapper.handleInputInitializerEvents(pendingEvents);
    verify(mockInitializer, never()).handleInputInitializerEvent(any(List.class));
    pendingEvents.clear();

    // Attempt 1 succeeds: its single event is delivered exactly once.
    wrapper.onTaskSucceeded(sourceVertexName, sourceTaskId, firstAttemptId.getId());
    ArgumentCaptor<List> deliveredCaptor = ArgumentCaptor.forClass(List.class);
    verify(mockInitializer, times(1)).handleInputInitializerEvent(deliveredCaptor.capture());
    assertEquals(1, deliveredCaptor.getValue().size());
    reset(mockInitializer);

    // Attempt 2 sends an event: again nothing may be delivered.
    TezTaskAttemptID secondAttemptId = TezTaskAttemptID.getInstance(sourceTaskId, 2);
    EventMetaData secondAttemptMeta = new EventMetaData(
        EventMetaData.EventProducerConsumerType.PROCESSOR, sourceVertexName, null, secondAttemptId);
    InputInitializerEvent secondEvent = InputInitializerEvent.create("fakeVertex", "fakeInput", null);
    pendingEvents.add(new TezEvent(secondEvent, secondAttemptMeta));
    wrapper.handleInputInitializerEvents(pendingEvents);
    verify(mockInitializer, never()).handleInputInitializerEvent(any(List.class));
    pendingEvents.clear();
    reset(mockInitializer);

    // Attempt 2 is reported as succeeded: its event must still not be delivered.
    wrapper.onTaskSucceeded(sourceVertexName, sourceTaskId, secondAttemptId.getId());
    verify(mockInitializer, never()).handleInputInitializerEvent(deliveredCaptor.capture());
}
Also used : InputDescriptor(org.apache.tez.dag.api.InputDescriptor) InputInitializer(org.apache.tez.runtime.api.InputInitializer) TezDAGID(org.apache.tez.dag.records.TezDAGID) List(java.util.List) TezVertexID(org.apache.tez.dag.records.TezVertexID) EventMetaData(org.apache.tez.runtime.api.impl.EventMetaData) RootInputLeafOutput(org.apache.tez.dag.api.RootInputLeafOutput) AppContext(org.apache.tez.dag.app.AppContext) InputInitializerContext(org.apache.tez.runtime.api.InputInitializerContext) TezTaskID(org.apache.tez.dag.records.TezTaskID) InputInitializerEvent(org.apache.tez.runtime.api.events.InputInitializerEvent) InputInitializerDescriptor(org.apache.tez.dag.api.InputInitializerDescriptor) TezEvent(org.apache.tez.runtime.api.impl.TezEvent) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) TezTaskAttemptID(org.apache.tez.dag.records.TezTaskAttemptID) Test(org.junit.Test)

Aggregations

InputDescriptor (org.apache.tez.dag.api.InputDescriptor) 37; Test (org.junit.Test) 18; InputInitializerDescriptor (org.apache.tez.dag.api.InputInitializerDescriptor) 11; OutputDescriptor (org.apache.tez.dag.api.OutputDescriptor) 10; InputContext (org.apache.tez.runtime.api.InputContext) 10; TezConfiguration (org.apache.tez.dag.api.TezConfiguration) 9; Configuration (org.apache.hadoop.conf.Configuration) 8; OutputContext (org.apache.tez.runtime.api.OutputContext) 8; UserPayload (org.apache.tez.dag.api.UserPayload) 7; LinkedList (java.util.LinkedList) 5; WeightedScalingMemoryDistributor (org.apache.tez.runtime.library.resources.WeightedScalingMemoryDistributor) 5; DataSourceDescriptor (org.apache.tez.dag.api.DataSourceDescriptor) 4; ProcessorDescriptor (org.apache.tez.dag.api.ProcessorDescriptor) 4; TezVertexID (org.apache.tez.dag.records.TezVertexID) 4; InputSpec (org.apache.tez.runtime.api.impl.InputSpec) 4; TezEvent (org.apache.tez.runtime.api.impl.TezEvent) 4; IOException (java.io.IOException) 3; List (java.util.List) 3; ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId) 3; RootInputLeafOutput (org.apache.tez.dag.api.RootInputLeafOutput) 3