Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.
In the class TestMRInputHelpers, the method testInputSplitLocalResourceCreation:
@Test(timeout = 5000)
public void testInputSplitLocalResourceCreation() throws Exception {
  DataSourceDescriptor dataSource = generateDataSourceDescriptorMapRed(oldSplitsDir);
  Map<String, LocalResource> localResources = dataSource.getAdditionalLocalFiles();
  Assert.assertEquals(2, localResources.size());
  Assert.assertTrue(localResources.containsKey(MRInputHelpers.JOB_SPLIT_RESOURCE_NAME));
  Assert.assertTrue(localResources.containsKey(MRInputHelpers.JOB_SPLIT_METAINFO_RESOURCE_NAME));
}
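The two resources asserted here, job.split and job.splitmetainfo, exist only when input splits are generated on the client rather than in the AM; the DataSourceDescriptor reports them through getAdditionalLocalFiles() so Tez can localize them into every task container. A minimal sketch of how such a descriptor is typically obtained, assuming the test helper wraps the legacy split-generation path in MRInputHelpers (the splits directory is a placeholder):

Path splitsDir = new Path("/tmp/splits");
// Generates MapReduce-style splits under splitsDir and returns a descriptor
// whose additional local files point at job.split and job.splitmetainfo.
DataSourceDescriptor ds =
    MRInputHelpers.configureMRInputWithLegacySplitGeneration(conf, splitsDir, true);
Map<String, LocalResource> splitFiles = ds.getAdditionalLocalFiles();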
Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.
In the class TestMRInput, the method testAttributesInJobConf:
@Test(timeout = 5000)
public void testAttributesInJobConf() throws Exception {
  InputContext inputContext = mock(InputContext.class);
  doReturn(TEST_ATTRIBUTES_DAG_INDEX).when(inputContext).getDagIdentifier();
  doReturn(TEST_ATTRIBUTES_VERTEX_INDEX).when(inputContext).getTaskVertexIndex();
  doReturn(TEST_ATTRIBUTES_TASK_INDEX).when(inputContext).getTaskIndex();
  doReturn(TEST_ATTRIBUTES_TASK_ATTEMPT_INDEX).when(inputContext).getTaskAttemptNumber();
  doReturn(TEST_ATTRIBUTES_INPUT_INDEX).when(inputContext).getInputIndex();
  doReturn(TEST_ATTRIBUTES_DAG_ATTEMPT_NUMBER).when(inputContext).getDAGAttemptNumber();
  doReturn(TEST_ATTRIBUTES_DAG_NAME).when(inputContext).getDAGName();
  doReturn(TEST_ATTRIBUTES_VERTEX_NAME).when(inputContext).getTaskVertexName();
  doReturn(TEST_ATTRIBUTES_INPUT_NAME).when(inputContext).getSourceVertexName();
  doReturn(TEST_ATTRIBUTES_APPLICATION_ID).when(inputContext).getApplicationId();
  doReturn(TEST_ATTRIBUTES_UNIQUE_IDENTIFIER).when(inputContext).getUniqueIdentifier();

  DataSourceDescriptor dsd = MRInput.createConfigBuilder(new Configuration(false),
      TestInputFormat.class).groupSplits(false).build();
  doReturn(dsd.getInputDescriptor().getUserPayload()).when(inputContext).getUserPayload();
  doReturn(new TezCounters()).when(inputContext).getCounters();

  MRInput mrInput = new MRInput(inputContext, 1);
  mrInput.initialize();

  MRRuntimeProtos.MRSplitProto splitProto = MRRuntimeProtos.MRSplitProto.newBuilder()
      .setSplitClassName(TestInputSplit.class.getName()).build();
  InputDataInformationEvent diEvent = InputDataInformationEvent.createWithSerializedPayload(
      0, splitProto.toByteString().asReadOnlyByteBuffer());
  List<Event> events = new LinkedList<>();
  events.add(diEvent);
  mrInput.handleEvents(events);

  TezCounter counter = mrInput.getContext().getCounters()
      .findCounter(TaskCounter.INPUT_SPLIT_LENGTH_BYTES);
  assertEquals(TestInputSplit.length, counter.getValue());
  assertTrue(TestInputFormat.invoked.get());
}
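Once initialize() has run and the InputDataInformationEvent has delivered the split, records are consumed through the input's KeyValueReader. A minimal consuming-side sketch (the processor plumbing around it is omitted):

// Iterate the records of the split delivered via the event above.
KeyValueReader reader = mrInput.getReader();
while (reader.next()) {
  Object key = reader.getCurrentKey();
  Object value = reader.getCurrentValue();
  // process (key, value) ...
}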
Use of org.apache.tez.dag.api.DataSourceDescriptor in project tez by apache.
In the class TestHistoryParser, the method runWordCount:
private String runWordCount(String tokenizerProcessor, String summationProcessor,
    String dagName, boolean withTimeline) throws Exception {
  // HDFS path
  Path outputLoc = new Path("/tmp/outPath_" + System.currentTimeMillis());
  DataSourceDescriptor dataSource = MRInput.createConfigBuilder(conf,
      TextInputFormat.class, inputLoc.toString()).build();
  DataSinkDescriptor dataSink = MROutput.createConfigBuilder(conf,
      TextOutputFormat.class, outputLoc.toString()).build();
  Vertex tokenizerVertex = Vertex.create(TOKENIZER,
      ProcessorDescriptor.create(tokenizerProcessor)).addDataSource(INPUT, dataSource);
  OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig.newBuilder(
      Text.class.getName(), IntWritable.class.getName(),
      HashPartitioner.class.getName()).build();
  Vertex summationVertex = Vertex.create(SUMMATION,
      ProcessorDescriptor.create(summationProcessor), 1).addDataSink(OUTPUT, dataSink);

  // Create DAG and add the vertices. Connect the producer and consumer vertices via the edge
  DAG dag = DAG.create(dagName);
  dag.addVertex(tokenizerVertex).addVertex(summationVertex)
      .addEdge(Edge.create(tokenizerVertex, summationVertex,
          edgeConf.createDefaultEdgeProperty()));

  TezClient tezClient = getTezClient(withTimeline);

  // Update Caller Context
  CallerContext callerContext =
      CallerContext.create("TezExamples", "Tez WordCount Example Job");
  ApplicationId appId = tezClient.getAppMasterApplicationId();
  if (appId == null) {
    appId = ApplicationId.newInstance(1001L, 1);
  }
  callerContext.setCallerIdAndType(appId.toString(), "TezApplication");
  dag.setCallerContext(callerContext);

  DAGClient client = tezClient.submitDAG(dag);
  client.waitForCompletionWithStatusUpdates(Sets.newHashSet(StatusGetOpts.GET_COUNTERS));
  TezDAGID tezDAGID = TezDAGID.getInstance(tezClient.getAppMasterApplicationId(), 1);
  if (tezClient != null) {
    tezClient.stop();
  }
  return tezDAGID.toString();
}
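getTezClient(withTimeline) is a helper of this test class; a minimal sketch of what such a helper typically does, assuming withTimeline toggles history logging to the YARN Timeline Server (the logging service class is the ATS implementation shipped with Tez; the client name is a placeholder):

private TezClient getTezClient(boolean withTimeline) throws Exception {
  TezConfiguration tezConf = new TezConfiguration(conf);
  if (withTimeline) {
    // route Tez history events to the YARN Timeline Server
    tezConf.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
    tezConf.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS,
        "org.apache.tez.dag.history.logging.ats.ATSHistoryLoggingService");
  }
  TezClient tezClient = TezClient.create("WordCountTest", tezConf);
  tezClient.start();
  return tezClient;
}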
Use of org.apache.tez.dag.api.DataSourceDescriptor in project hive by apache.
In the class DagUtils, the method createVertexFromMapWork:
/*
* Helper function to create Vertex from MapWork.
*/
private Vertex createVertexFromMapWork(JobConf conf, MapWork mapWork, Path mrScratchDir,
    VertexType vertexType) throws Exception {
  // set up the operator plan
  Utilities.cacheMapWork(conf, mapWork, mrScratchDir);
  // create the directories FileSinkOperators need
  Utilities.createTmpDirs(conf, mapWork);
  // finally create the vertex
  Vertex map = null;
  // use tez to combine splits
  boolean groupSplitsInInputInitializer;
  DataSourceDescriptor dataSource;
  int numTasks = -1;
  @SuppressWarnings("rawtypes")
  Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
  boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
  LOG.info("Vertex has custom input? " + vertexHasCustomInput);
  if (vertexHasCustomInput) {
    groupSplitsInInputInitializer = false;
    // grouping happens in the execution phase; it will be enabled in the CustomVertex.
    if (inputFormatClass != BucketizedHiveInputFormat.class
        && inputFormatClass != HiveInputFormat.class) {
      // As of now only these two formats are supported.
      inputFormatClass = HiveInputFormat.class;
    }
    conf.setClass("mapred.input.format.class", inputFormatClass, InputFormat.class);
    // mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
    // this plug-in to avoid getting a serialized event at run-time.
    conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
  } else {
    // is HiveInputFormat
    if (inputFormatClass == HiveInputFormat.class) {
      groupSplitsInInputInitializer = true;
    } else {
      groupSplitsInInputInitializer = false;
    }
  }
  if (mapWork instanceof MergeFileWork) {
    Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
    // prepare the tmp output directory. The output tmp directory should
    // exist before jobClose (before renaming after job completion)
    Path tempOutPath = Utilities.toTempPath(outputPath);
    try {
      FileSystem tmpOutFS = tempOutPath.getFileSystem(conf);
      if (!tmpOutFS.exists(tempOutPath)) {
        tmpOutFS.mkdirs(tempOutPath);
      }
    } catch (IOException e) {
      throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage(), e);
    }
  }
  // remember mapping of plan to input
  conf.set(Utilities.INPUT_NAME, mapWork.getName());
  if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
    // set up the operator plan. (before setting up splits on the AM)
    Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
    // if we are generating the splits in the AM, we just need to set
    // the correct plugin.
    if (groupSplitsInInputInitializer) {
      // Not setting a payload, since the MRInput payload is the same and can be accessed.
      InputInitializerDescriptor descriptor =
          InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
      dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass)
          .groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
    } else {
      // Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
      if (vertexHasCustomInput && vertexType == VertexType.MULTI_INPUT_UNINITIALIZED_EDGES) {
        // SMB Join.
        dataSource = MultiMRInput.createConfigBuilder(conf, inputFormatClass)
            .groupSplits(false).build();
      } else {
        dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass)
            .groupSplits(false).build();
      }
    }
  } else {
    // Set up client-side split generation.

    // we need to set this, because with HS2 and client side split
    // generation we end up not finding the map work. This is
    // because of thread local madness (tez split generation is
    // multi-threaded - HS2 plan cache uses thread locals). Setting
    // VECTOR_MODE/USE_VECTORIZED_INPUT_FILE_FORMAT causes the split gen code
    // to use the conf instead of the map work.
    conf.setBoolean(Utilities.VECTOR_MODE, mapWork.getVectorMode());
    conf.setBoolean(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT,
        mapWork.getUseVectorizedInputFileFormat());
    InputSplitInfo inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(conf, false, 0);
    InputInitializerDescriptor descriptor =
        InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName());
    InputDescriptor inputDescriptor = InputDescriptor.create(MRInputLegacy.class.getName())
        .setUserPayload(UserPayload.create(
            MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
                .setConfigurationBytes(TezUtils.createByteStringFromConf(conf))
                .setSplits(inputSplitInfo.getSplitsProto())
                .build().toByteString().asReadOnlyByteBuffer()));
    dataSource = DataSourceDescriptor.create(inputDescriptor, descriptor, null);
    numTasks = inputSplitInfo.getNumTasks();
    // set up the operator plan. (after generating splits - that changes configs)
    Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
  }
  UserPayload serializedConf = TezUtils.createUserPayloadFromConf(conf);
  String procClassName = MapTezProcessor.class.getName();
  if (mapWork instanceof MergeFileWork) {
    procClassName = MergeFileTezProcessor.class.getName();
  }
  map = Vertex.create(mapWork.getName(),
      ProcessorDescriptor.create(procClassName).setUserPayload(serializedConf),
      numTasks, getContainerResource(conf));
  map.setTaskEnvironment(getContainerEnvironment(conf, true));
  assert mapWork.getAliasToWork().keySet().size() == 1;
  // Add the actual source input
  String alias = mapWork.getAliasToWork().keySet().iterator().next();
  map.addDataSource(alias, dataSource);
  return map;
}
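createVertexFromMapWork is a private helper, so a caller inside DagUtils wires its result into the query DAG; an illustrative sketch of that calling side (the variable names here are hypothetical, not taken from DagUtils):

// Hypothetical caller: create the map vertex and register it with the DAG.
DAG dag = DAG.create("hive-query");
Vertex mapVertex = createVertexFromMapWork(jobConf, mapWork, scratchDir,
    VertexType.AUTO_INITIALIZED_EDGES);
dag.addVertex(mapVertex);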