Example 11 with OutputContext

Use of org.apache.tez.runtime.api.OutputContext in project tez by apache.

The class TestUnorderedPartitionedKVWriter, method baseTestWithFinalMergeDisabled.

@SuppressWarnings("unchecked")
private void baseTestWithFinalMergeDisabled(int numRecords, int numPartitions, Set<Integer> skippedPartitions, boolean shouldCompress) throws IOException, InterruptedException {
    PartitionerForTest partitioner = new PartitionerForTest();
    ApplicationId appId = ApplicationId.newInstance(10000000, 1);
    TezCounters counters = new TezCounters();
    String uniqueId = UUID.randomUUID().toString();
    int dagId = 1;
    String auxiliaryService = defaultConf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
    OutputContext outputContext = createMockOutputContext(counters, appId, uniqueId, auxiliaryService);
    Configuration conf = createConfiguration(outputContext, IntWritable.class, LongWritable.class, shouldCompress, -1);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_ENABLE_FINAL_MERGE_IN_OUTPUT, false);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_PIPELINED_SHUFFLE_ENABLED, false);
    CompressionCodec codec = null;
    if (shouldCompress) {
        codec = new DefaultCodec();
        ((Configurable) codec).setConf(conf);
    }
    int numOutputs = numPartitions;
    long availableMemory = 2048;
    int numRecordsWritten = 0;
    UnorderedPartitionedKVWriter kvWriter = new UnorderedPartitionedKVWriterForTest(outputContext, conf, numOutputs, availableMemory);
    int sizePerBuffer = kvWriter.sizePerBuffer;
    // IntW + LongW
    int sizePerRecord = 4 + 8;
    // Record + META_OVERHEAD
    int sizePerRecordWithOverhead = sizePerRecord + 12;
    BitSet partitionsWithData = new BitSet(numPartitions);
    IntWritable intWritable = new IntWritable();
    LongWritable longWritable = new LongWritable();
    for (int i = 0; i < numRecords; i++) {
        intWritable.set(i);
        longWritable.set(i);
        int partition = partitioner.getPartition(intWritable, longWritable, numOutputs);
        if (skippedPartitions != null && skippedPartitions.contains(partition)) {
            continue;
        }
        partitionsWithData.set(partition);
        kvWriter.write(intWritable, longWritable);
        numRecordsWritten++;
    }
    int recordsPerBuffer = sizePerBuffer / sizePerRecordWithOverhead;
    int numExpectedSpills = numRecordsWritten / recordsPerBuffer;
    ArgumentCaptor<List> eventCaptor = ArgumentCaptor.forClass(List.class);
    List<Event> lastEvents = kvWriter.close();
    if (numPartitions == 1) {
        assertEquals(true, kvWriter.skipBuffers);
    }
    // Max events sent are spills + one VM event. If there are no spills, at least the empty
    // partitions are still reported in the final event.
    int spills = Math.max(1, kvWriter.numSpills.get());
    // spills + VMEvent
    assertEquals((spills + 1), lastEvents.size());
    verify(outputContext, atMost(0)).sendEvents(eventCaptor.capture());
    for (int i = 0; i < lastEvents.size(); i++) {
        Event event = lastEvents.get(i);
        if (event instanceof VertexManagerEvent) {
            // Verify the partition stats carried in the VertexManagerEvent.
            if (numRecordsWritten > 0) {
                verifyPartitionStats(((VertexManagerEvent) event), partitionsWithData);
            }
        }
    }
    verify(outputContext, never()).reportFailure(any(TaskFailureType.class), any(Throwable.class), any(String.class));
    assertNull(kvWriter.currentBuffer);
    assertEquals(0, kvWriter.availableBuffers.size());
    // Verify the counters
    TezCounter outputRecordBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES);
    TezCounter outputRecordsCounter = counters.findCounter(TaskCounter.OUTPUT_RECORDS);
    TezCounter outputBytesWithOverheadCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_WITH_OVERHEAD);
    TezCounter fileOutputBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_PHYSICAL);
    TezCounter spilledRecordsCounter = counters.findCounter(TaskCounter.SPILLED_RECORDS);
    TezCounter additionalSpillBytesWritternCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_WRITTEN);
    TezCounter additionalSpillBytesReadCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_READ);
    TezCounter numAdditionalSpillsCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILL_COUNT);
    assertEquals(numRecordsWritten * sizePerRecord, outputRecordBytesCounter.getValue());
    assertEquals(numRecordsWritten, outputRecordsCounter.getValue());
    if (outputRecordsCounter.getValue() > 0) {
        assertEquals(numRecordsWritten * sizePerRecordWithOverhead, outputBytesWithOverheadCounter.getValue());
    } else {
        assertEquals(0, outputBytesWithOverheadCounter.getValue());
    }
    long fileOutputBytes = fileOutputBytesCounter.getValue();
    if (numRecordsWritten > 0) {
        assertTrue(fileOutputBytes > 0);
        if (!shouldCompress) {
            assertTrue("fileOutputBytes=" + fileOutputBytes + ", outputRecordBytes=" + outputRecordBytesCounter.getValue(), fileOutputBytes > outputRecordBytesCounter.getValue());
        }
    } else {
        assertEquals(0, fileOutputBytes);
    }
    // due to multiple threads, buffers could be merged in chunks in scheduleSpill.
    assertTrue(recordsPerBuffer * numExpectedSpills >= spilledRecordsCounter.getValue());
    long additionalSpillBytesWritten = additionalSpillBytesWritternCounter.getValue();
    long additionalSpillBytesRead = additionalSpillBytesReadCounter.getValue();
    // No additional spill bytes are written when final merge is disabled.
    assertEquals(0, additionalSpillBytesWritten);
    // Consequently, additional spill bytes read should match bytes written (both zero).
    assertTrue(additionalSpillBytesWritten == additionalSpillBytesRead);
    // No additional spills when final merge is disabled.
    assertEquals(0, numAdditionalSpillsCounter.getValue());
    assertTrue(lastEvents.size() > 0);
    // Get the last event
    int index = lastEvents.size() - 1;
    assertTrue(lastEvents.get(index) instanceof CompositeDataMovementEvent);
    CompositeDataMovementEvent cdme = (CompositeDataMovementEvent) lastEvents.get(index);
    assertEquals(0, cdme.getSourceIndexStart());
    assertEquals(numOutputs, cdme.getCount());
    DataMovementEventPayloadProto eventProto = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
    verifyEmptyPartitions(eventProto, numRecordsWritten, numPartitions, skippedPartitions);
    if (outputRecordsCounter.getValue() > 0) {
        // Ensure that this is the last event
        assertTrue(eventProto.getLastEvent());
    }
    // Verify that all path components carry a spill id when final merge is disabled.
    Pattern mergePathComponentPattern = Pattern.compile("(.*)(_\\d+)");
    for (Event event : lastEvents) {
        if (!(event instanceof CompositeDataMovementEvent)) {
            continue;
        }
        cdme = (CompositeDataMovementEvent) event;
        eventProto = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
        assertEquals(false, eventProto.getPipelined());
        if (eventProto.hasPathComponent()) {
            // When final merge is disabled, the path component should end with a _spillId suffix.
            Matcher matcher = mergePathComponentPattern.matcher(eventProto.getPathComponent());
            assertTrue("spill id should be present in path component " + eventProto.getPathComponent(), matcher.matches());
            assertEquals(2, matcher.groupCount());
            assertEquals(uniqueId, matcher.group(1));
            assertTrue("spill id should be present in path component", matcher.group(2) != null);
            Path outputPath = new Path(outputContext.getWorkDirs()[0], "output/" + eventProto.getPathComponent() + "/" + Constants.TEZ_RUNTIME_TASK_OUTPUT_FILENAME_STRING);
            Path indexPath = outputPath.suffix(Constants.TEZ_RUNTIME_TASK_OUTPUT_INDEX_SUFFIX_STRING);
            assertEquals("Incorrect output permissions", (short) 0640, localFs.getFileStatus(outputPath).getPermission().toShort());
            assertEquals("Incorrect index permissions", (short) 0640, localFs.getFileStatus(indexPath).getPermission().toShort());
        } else {
            assertEquals(0, eventProto.getSpillId());
            if (outputRecordsCounter.getValue() > 0) {
                assertEquals(true, eventProto.getLastEvent());
            } else {
                byte[] emptyPartitions = TezCommonUtils.decompressByteStringToByteArray(eventProto.getEmptyPartitions());
                BitSet emptyPartitionBits = TezUtilsInternal.fromByteArray(emptyPartitions);
                assertEquals(numPartitions, emptyPartitionBits.cardinality());
            }
        }
    }
    verify(outputContext, atLeast(1)).notifyProgress();
    // Verify that all spill files are available.
    TezTaskOutput taskOutput = new TezTaskOutputFiles(conf, uniqueId, dagId);
    if (numRecordsWritten > 0) {
        int numSpills = kvWriter.numSpills.get();
        for (int i = 0; i < numSpills; i++) {
            assertTrue(localFs.exists(taskOutput.getSpillFileForWrite(i, 10)));
            assertTrue(localFs.exists(taskOutput.getSpillIndexFileForWrite(i, 10)));
        }
    } else {
        return;
    }
}
Also used : TezTaskOutputFiles(org.apache.tez.runtime.library.common.task.local.output.TezTaskOutputFiles) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezRuntimeConfiguration(org.apache.tez.runtime.library.api.TezRuntimeConfiguration) Matcher(java.util.regex.Matcher) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) ByteString(com.google.protobuf.ByteString) Configurable(org.apache.hadoop.conf.Configurable) TezCounter(org.apache.tez.common.counters.TezCounter) List(java.util.List) DataMovementEventPayloadProto(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.DataMovementEventPayloadProto) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) LongWritable(org.apache.hadoop.io.LongWritable) IntWritable(org.apache.hadoop.io.IntWritable) Path(org.apache.hadoop.fs.Path) Pattern(java.util.regex.Pattern) BitSet(java.util.BitSet) TezCounters(org.apache.tez.common.counters.TezCounters) OutputContext(org.apache.tez.runtime.api.OutputContext) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) TaskFailureType(org.apache.tez.runtime.api.TaskFailureType) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) Event(org.apache.tez.runtime.api.Event) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) TezTaskOutput(org.apache.tez.runtime.library.common.task.local.output.TezTaskOutput)
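
The test above relies on a createMockOutputContext helper that this snippet does not show. Below is a minimal Mockito-based sketch of how such a helper could be written, not the actual helper from TestUnorderedPartitionedKVWriter: the OutputContext accessors stubbed here (getCounters, getApplicationId, getUniqueIdentifier, getDestinationVertexName, getServiceProviderMetaData, getWorkDirs) are real Tez API methods, while the vertex name, the port value, TEST_ROOT_DIR, and the Mockito static imports are assumptions made for illustration.

private OutputContext createMockOutputContext(TezCounters counters, ApplicationId appId, String uniqueId, String auxiliaryService) {
    OutputContext outputContext = mock(OutputContext.class);
    doReturn(counters).when(outputContext).getCounters();
    doReturn(appId).when(outputContext).getApplicationId();
    doReturn(uniqueId).when(outputContext).getUniqueIdentifier();
    // The vertex name and shuffle port are arbitrary values for this sketch.
    doReturn("destinationVertex").when(outputContext).getDestinationVertexName();
    ByteBuffer portBuffer = ByteBuffer.allocate(4);
    portBuffer.putInt(0, 8000);
    doReturn(portBuffer).when(outputContext).getServiceProviderMetaData(auxiliaryService);
    // TEST_ROOT_DIR is a hypothetical local directory used as the task work dir.
    String[] workDirs = new String[] { new Path(TEST_ROOT_DIR, "outDir_" + uniqueId).toString() };
    doReturn(workDirs).when(outputContext).getWorkDirs();
    return outputContext;
}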

Example 12 with OutputContext

Use of org.apache.tez.runtime.api.OutputContext in project tez by apache.

The class TestOnFileUnorderedKVOutput, method testWithPipelinedShuffle.

@Test(timeout = 30000)
@SuppressWarnings("unchecked")
public void testWithPipelinedShuffle() throws Exception {
    Configuration conf = new Configuration();
    conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, Text.class.getName());
    conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, IntWritable.class.getName());
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_PIPELINED_SHUFFLE_ENABLED, true);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_ENABLE_FINAL_MERGE_IN_OUTPUT, false);
    conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_UNORDERED_OUTPUT_BUFFER_SIZE_MB, 1);
    TezSharedExecutor sharedExecutor = new TezSharedExecutor(conf);
    OutputContext outputContext = createOutputContext(conf, sharedExecutor);
    UnorderedKVOutput kvOutput = new UnorderedKVOutput(outputContext, 1);
    List<Event> events = null;
    events = kvOutput.initialize();
    kvOutput.start();
    assertTrue(events != null && events.size() == 0);
    KeyValueWriter kvWriter = kvOutput.getWriter();
    for (int i = 0; i < 500; i++) {
        kvWriter.write(new Text(RandomStringUtils.randomAscii(10000)), new IntWritable(i));
    }
    events = kvOutput.close();
    // When pipelining is on, all events are sent out from within the writer itself.
    assertTrue(events != null && events.size() == 0);
    // Ensure that the data movement events were sent via the outputContext.
    ArgumentCaptor<List> eventsCaptor = ArgumentCaptor.forClass(List.class);
    verify(outputContext, atLeast(1)).sendEvents(eventsCaptor.capture());
    events = eventsCaptor.getValue();
    CompositeDataMovementEvent dmEvent = (CompositeDataMovementEvent) events.get(1);
    assertEquals("Invalid source index", 0, dmEvent.getSourceIndexStart());
    DataMovementEventPayloadProto shufflePayload = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(dmEvent.getUserPayload()));
    assertTrue(shufflePayload.hasLastEvent());
    assertFalse(shufflePayload.hasEmptyPartitions());
    assertEquals(shufflePort, shufflePayload.getPort());
    assertEquals("localhost", shufflePayload.getHost());
    sharedExecutor.shutdownNow();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezRuntimeConfiguration(org.apache.tez.runtime.library.api.TezRuntimeConfiguration) Text(org.apache.hadoop.io.Text) OutputContext(org.apache.tez.runtime.api.OutputContext) KeyValueWriter(org.apache.tez.runtime.library.api.KeyValueWriter) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) TezSharedExecutor(org.apache.tez.common.TezSharedExecutor) Event(org.apache.tez.runtime.api.Event) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) List(java.util.List) DataMovementEventPayloadProto(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.DataMovementEventPayloadProto) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)
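
Because pipelined shuffle pushes events through outputContext.sendEvents() instead of returning them from close(), a test can also inspect every captured batch rather than only the last one. A minimal sketch, assuming the same mocked outputContext and the Mockito/JUnit static imports used by the test above:

// Walk every batch the writer sent, not just eventsCaptor.getValue() (the last one).
ArgumentCaptor<List> allBatches = ArgumentCaptor.forClass(List.class);
verify(outputContext, atLeast(1)).sendEvents(allBatches.capture());
int dmeCount = 0;
for (List batch : allBatches.getAllValues()) {
    for (Object e : batch) {
        if (e instanceof CompositeDataMovementEvent) {
            dmeCount++;
        }
    }
}
// At least one composite data movement event should have been pushed out while writing.
assertTrue("expected at least one CompositeDataMovementEvent", dmeCount >= 1);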

Example 13 with OutputContext

Use of org.apache.tez.runtime.api.OutputContext in project tez by apache.

The class TestOnFileUnorderedKVOutput, method createOutputContext.

private OutputContext createOutputContext(Configuration conf, TezSharedExecutor sharedExecutor) throws IOException {
    int appAttemptNumber = 1;
    TezUmbilical tezUmbilical = mock(TezUmbilical.class);
    String dagName = "currentDAG";
    String taskVertexName = "currentVertex";
    String destinationVertexName = "destinationVertex";
    TezDAGID dagID = TezDAGID.getInstance("2000", 1, 1);
    TezVertexID vertexID = TezVertexID.getInstance(dagID, 1);
    TezTaskID taskID = TezTaskID.getInstance(vertexID, 1);
    TezTaskAttemptID taskAttemptID = TezTaskAttemptID.getInstance(taskID, 1);
    UserPayload userPayload = TezUtils.createUserPayloadFromConf(conf);
    TaskSpec mockSpec = mock(TaskSpec.class);
    when(mockSpec.getInputs()).thenReturn(Collections.singletonList(mock(InputSpec.class)));
    when(mockSpec.getOutputs()).thenReturn(Collections.singletonList(mock(OutputSpec.class)));
    task = new LogicalIOProcessorRuntimeTask(mockSpec, appAttemptNumber, new Configuration(), new String[] { "/" }, tezUmbilical, null, null, null, null, "", null, 1024, false, new DefaultHadoopShim(), sharedExecutor);
    LogicalIOProcessorRuntimeTask runtimeTask = spy(task);
    Map<String, String> auxEnv = new HashMap<String, String>();
    ByteBuffer bb = ByteBuffer.allocate(4);
    bb.putInt(shufflePort);
    bb.position(0);
    AuxiliaryServiceHelper.setServiceDataIntoEnv(conf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT), bb, auxEnv);
    OutputDescriptor outputDescriptor = mock(OutputDescriptor.class);
    when(outputDescriptor.getClassName()).thenReturn("OutputDescriptor");
    OutputContext realOutputContext = new TezOutputContextImpl(conf, new String[] { workDir.toString() }, appAttemptNumber, tezUmbilical, dagName, taskVertexName, destinationVertexName, -1, taskAttemptID, 0, userPayload, runtimeTask, null, auxEnv, new MemoryDistributor(1, 1, conf), outputDescriptor, null, new ExecutionContextImpl("localhost"), 2048, new TezSharedExecutor(defaultConf));
    verify(runtimeTask, times(1)).addAndGetTezCounter(destinationVertexName);
    verify(runtimeTask, times(1)).getTaskStatistics();
    // verify output stats object got created
    Assert.assertTrue(task.getTaskStatistics().getIOStatistics().containsKey(destinationVertexName));
    OutputContext outputContext = spy(realOutputContext);
    doAnswer(new Answer() {

        @Override
        public Object answer(InvocationOnMock invocation) throws Throwable {
            long requestedSize = (Long) invocation.getArguments()[0];
            MemoryUpdateCallbackHandler callback = (MemoryUpdateCallbackHandler) invocation.getArguments()[1];
            callback.memoryAssigned(requestedSize);
            return null;
        }
    }).when(outputContext).requestInitialMemory(anyLong(), any(MemoryUpdateCallback.class));
    return outputContext;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezRuntimeConfiguration(org.apache.tez.runtime.library.api.TezRuntimeConfiguration) HashMap(java.util.HashMap) ByteString(com.google.protobuf.ByteString) DefaultHadoopShim(org.apache.tez.hadoop.shim.DefaultHadoopShim) OutputDescriptor(org.apache.tez.dag.api.OutputDescriptor) TezDAGID(org.apache.tez.dag.records.TezDAGID) MemoryUpdateCallbackHandler(org.apache.tez.runtime.library.common.MemoryUpdateCallbackHandler) MemoryUpdateCallback(org.apache.tez.runtime.api.MemoryUpdateCallback) TezVertexID(org.apache.tez.dag.records.TezVertexID) LogicalIOProcessorRuntimeTask(org.apache.tez.runtime.LogicalIOProcessorRuntimeTask) TezOutputContextImpl(org.apache.tez.runtime.api.impl.TezOutputContextImpl) UserPayload(org.apache.tez.dag.api.UserPayload) ExecutionContextImpl(org.apache.tez.runtime.api.impl.ExecutionContextImpl) TaskSpec(org.apache.tez.runtime.api.impl.TaskSpec) ByteBuffer(java.nio.ByteBuffer) TezTaskID(org.apache.tez.dag.records.TezTaskID) OutputContext(org.apache.tez.runtime.api.OutputContext) Mockito.doAnswer(org.mockito.Mockito.doAnswer) Answer(org.mockito.stubbing.Answer) InvocationOnMock(org.mockito.invocation.InvocationOnMock) TezSharedExecutor(org.apache.tez.common.TezSharedExecutor) TezUmbilical(org.apache.tez.runtime.api.impl.TezUmbilical) MemoryDistributor(org.apache.tez.runtime.common.resources.MemoryDistributor) TezTaskAttemptID(org.apache.tez.dag.records.TezTaskAttemptID)
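
The ByteBuffer written into auxEnv above is how the shuffle port reaches the output at runtime: the context exposes it through getServiceProviderMetaData, keyed by the auxiliary service id. A minimal sketch of reading it back, assuming TezOutputContextImpl surfaces the aux-service data exactly as wired above (variable names mirror the method above):

String auxService = conf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID,
        TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
// The port was written as a single int at position 0 of the buffer.
ByteBuffer meta = outputContext.getServiceProviderMetaData(auxService);
assertEquals(shufflePort, meta.getInt(0));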

Example 14 with OutputContext

Use of org.apache.tez.runtime.api.OutputContext in project tez by apache.

The class TestOnFileSortedOutput, method _testPipelinedShuffle.

private void _testPipelinedShuffle(String sorterName) throws Exception {
    conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, 3);
    conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_SORTER_CLASS, sorterName);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_ENABLE_FINAL_MERGE_IN_OUTPUT, false);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_PIPELINED_SHUFFLE_ENABLED, true);
    OutputContext context = createTezOutputContext();
    UserPayload payLoad = TezUtils.createUserPayloadFromConf(conf);
    doReturn(payLoad).when(context).getUserPayload();
    sortedOutput = new OrderedPartitionedKVOutput(context, partitions);
    sortedOutput.initialize();
    sortedOutput.start();
    assertFalse(sortedOutput.finalMergeEnabled);
    assertTrue(sortedOutput.pipelinedShuffle);
}
Also used : UserPayload(org.apache.tez.dag.api.UserPayload) OutputContext(org.apache.tez.runtime.api.OutputContext)
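
The three configuration knobs above (small sort buffer, final merge disabled, pipelined shuffle enabled) recur across these pipelined-shuffle tests. A purely illustrative helper that bundles them is sketched below; the helper name is hypothetical, while the keys are the TezRuntimeConfiguration constants already used above.

// Hypothetical helper bundling the pipelined-shuffle settings used in these tests.
private static void enablePipelinedShuffle(Configuration conf, int sortMb) {
    // Keep the sort buffer small so that multiple spills, and hence multiple events, occur.
    conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, sortMb);
    // These tests always pair pipelined shuffle with final merge disabled.
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_ENABLE_FINAL_MERGE_IN_OUTPUT, false);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_PIPELINED_SHUFFLE_ENABLED, true);
}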

Example 15 with OutputContext

Use of org.apache.tez.runtime.api.OutputContext in project tez by apache.

The class TestOnFileSortedOutput, method startSortedOutput.

private void startSortedOutput(int partitions) throws Exception {
    OutputContext context = createTezOutputContext();
    conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, 4);
    UserPayload payLoad = TezUtils.createUserPayloadFromConf(conf);
    doReturn(payLoad).when(context).getUserPayload();
    sortedOutput = new OrderedPartitionedKVOutput(context, partitions);
    sortedOutput.initialize();
    sortedOutput.start();
    writer = sortedOutput.getWriter();
}
Also used : UserPayload(org.apache.tez.dag.api.UserPayload) OutputContext(org.apache.tez.runtime.api.OutputContext)
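
After startSortedOutput returns, a test typically writes records through the writer and then closes the output to collect its events. A minimal usage sketch; the Text key/value types and the record count are assumptions about how createTezOutputContext configures this output.

startSortedOutput(2);
for (int i = 0; i < 100; i++) {
    // Key/value classes are assumed to be Text for this sketch.
    writer.write(new Text("key_" + i), new Text("value_" + i));
}
// With the defaults here (final merge on, pipelining off), close() is expected to
// return the data movement events for the framework to route.
List<Event> events = sortedOutput.close();
assertTrue(events != null && events.size() > 0);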

Aggregations

OutputContext (org.apache.tez.runtime.api.OutputContext): 61
Test (org.junit.Test): 38
Configuration (org.apache.hadoop.conf.Configuration): 19
TezConfiguration (org.apache.tez.dag.api.TezConfiguration): 15
MemoryUpdateCallbackHandler (org.apache.tez.runtime.library.common.MemoryUpdateCallbackHandler): 14
TezCounters (org.apache.tez.common.counters.TezCounters): 13
OutputDescriptor (org.apache.tez.dag.api.OutputDescriptor): 13
UserPayload (org.apache.tez.dag.api.UserPayload): 13
Path (org.apache.hadoop.fs.Path): 12
Event (org.apache.tez.runtime.api.Event): 12
ByteString (com.google.protobuf.ByteString): 11
DataSinkDescriptor (org.apache.tez.dag.api.DataSinkDescriptor): 11
CompositeDataMovementEvent (org.apache.tez.runtime.api.events.CompositeDataMovementEvent): 11
ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 9
InputContext (org.apache.tez.runtime.api.InputContext): 9
BitSet (java.util.BitSet): 8
InputDescriptor (org.apache.tez.dag.api.InputDescriptor): 8
TezRuntimeConfiguration (org.apache.tez.runtime.library.api.TezRuntimeConfiguration): 8
ByteBuffer (java.nio.ByteBuffer): 6
Text (org.apache.hadoop.io.Text): 6