Search in sources :

Example 1 with TezSpillRecord

use of org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord in project tez by apache.

the class TestShuffleUtils method testGenerateOnSpillEvent_With_All_EmptyPartitions.

@Test
public void testGenerateOnSpillEvent_With_All_EmptyPartitions() throws Exception {
    List<Event> events = Lists.newLinkedList();
    // Create an index file with all empty partitions
    Path indexFile = createIndexFile(10, true);
    boolean finalMergeDisabled = false;
    boolean isLastEvent = true;
    int spillId = 0;
    int physicalOutputs = 10;
    String pathComponent = "/attempt_x_y_0/file.out";
    String auxiliaryService = conf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
    // normal code path where we do final merge all the time
    ShuffleUtils.generateEventOnSpill(events, finalMergeDisabled, isLastEvent, outputContext, spillId, new TezSpillRecord(indexFile, conf), physicalOutputs, true, pathComponent, null, false, auxiliaryService, TezCommonUtils.newBestCompressionDeflater());
    // one for VM
    Assert.assertTrue(events.size() == 2);
    Assert.assertTrue(events.get(0) instanceof VertexManagerEvent);
    Assert.assertTrue(events.get(1) instanceof CompositeDataMovementEvent);
    CompositeDataMovementEvent cdme = (CompositeDataMovementEvent) events.get(1);
    Assert.assertTrue(cdme.getCount() == physicalOutputs);
    Assert.assertTrue(cdme.getSourceIndexStart() == 0);
    ShuffleUserPayloads.DataMovementEventPayloadProto dmeProto = ShuffleUserPayloads.DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
    // spill details should be present
    Assert.assertTrue(dmeProto.getSpillId() == 0);
    Assert.assertTrue(dmeProto.hasLastEvent() && dmeProto.getLastEvent());
    Assert.assertTrue(dmeProto.getPathComponent().equals(""));
    byte[] emptyPartitions = TezCommonUtils.decompressByteStringToByteArray(dmeProto.getEmptyPartitions());
    BitSet emptyPartitionsBitSet = TezUtilsInternal.fromByteArray(emptyPartitions);
    Assert.assertTrue("emptyPartitionBitSet cardinality (expecting 10) = " + emptyPartitionsBitSet.cardinality(), emptyPartitionsBitSet.cardinality() == 10);
}
Also used : Path(org.apache.hadoop.fs.Path) BitSet(java.util.BitSet) ByteString(com.google.protobuf.ByteString) Mockito.anyString(org.mockito.Mockito.anyString) TezSpillRecord(org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) Event(org.apache.tez.runtime.api.Event) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) ShuffleUserPayloads(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads) Test(org.junit.Test)

Example 2 with TezSpillRecord

use of org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord in project tez by apache.

the class TestShuffleUtils method testGenerateOnSpillEvent.

@Test
public void testGenerateOnSpillEvent() throws Exception {
    List<Event> events = Lists.newLinkedList();
    Path indexFile = createIndexFile(10, false);
    boolean finalMergeEnabled = false;
    boolean isLastEvent = false;
    int spillId = 0;
    int physicalOutputs = 10;
    String pathComponent = "/attempt_x_y_0/file.out";
    String auxiliaryService = conf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
    ShuffleUtils.generateEventOnSpill(events, finalMergeEnabled, isLastEvent, outputContext, spillId, new TezSpillRecord(indexFile, conf), physicalOutputs, true, pathComponent, null, false, auxiliaryService, TezCommonUtils.newBestCompressionDeflater());
    Assert.assertTrue(events.size() == 1);
    Assert.assertTrue(events.get(0) instanceof CompositeDataMovementEvent);
    CompositeDataMovementEvent cdme = (CompositeDataMovementEvent) events.get(0);
    Assert.assertTrue(cdme.getCount() == physicalOutputs);
    Assert.assertTrue(cdme.getSourceIndexStart() == 0);
    ByteBuffer payload = cdme.getUserPayload();
    ShuffleUserPayloads.DataMovementEventPayloadProto dmeProto = ShuffleUserPayloads.DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(payload));
    Assert.assertTrue(dmeProto.getSpillId() == 0);
    Assert.assertTrue(dmeProto.hasLastEvent() && !dmeProto.getLastEvent());
    byte[] emptyPartitions = TezCommonUtils.decompressByteStringToByteArray(dmeProto.getEmptyPartitions());
    BitSet emptyPartitionsBitSet = TezUtilsInternal.fromByteArray(emptyPartitions);
    Assert.assertTrue("emptyPartitionBitSet cardinality (expecting 5) = " + emptyPartitionsBitSet.cardinality(), emptyPartitionsBitSet.cardinality() == 5);
    events.clear();
}
Also used : Path(org.apache.hadoop.fs.Path) BitSet(java.util.BitSet) ByteString(com.google.protobuf.ByteString) Mockito.anyString(org.mockito.Mockito.anyString) ByteBuffer(java.nio.ByteBuffer) TezSpillRecord(org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) Event(org.apache.tez.runtime.api.Event) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) ShuffleUserPayloads(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads) Test(org.junit.Test)

Example 3 with TezSpillRecord

use of org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord in project tez by apache.

the class TestUnorderedPartitionedKVWriter method textTest.

public void textTest(int numRegularRecords, int numPartitions, long availableMemory, int numLargeKeys, int numLargevalues, int numLargeKvPairs, boolean pipeliningEnabled, boolean isFinalMergeEnabled) throws IOException, InterruptedException {
    Partitioner partitioner = new HashPartitioner();
    ApplicationId appId = ApplicationId.newInstance(10000000, 1);
    TezCounters counters = new TezCounters();
    String uniqueId = UUID.randomUUID().toString();
    int dagId = 1;
    String auxiliaryService = defaultConf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
    OutputContext outputContext = createMockOutputContext(counters, appId, uniqueId, auxiliaryService);
    Random random = new Random();
    Configuration conf = createConfiguration(outputContext, Text.class, Text.class, shouldCompress, -1, HashPartitioner.class);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_PIPELINED_SHUFFLE_ENABLED, pipeliningEnabled);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_ENABLE_FINAL_MERGE_IN_OUTPUT, isFinalMergeEnabled);
    CompressionCodec codec = null;
    if (shouldCompress) {
        codec = new DefaultCodec();
        ((Configurable) codec).setConf(conf);
    }
    int numRecordsWritten = 0;
    Map<Integer, Multimap<String, String>> expectedValues = new HashMap<Integer, Multimap<String, String>>();
    for (int i = 0; i < numPartitions; i++) {
        expectedValues.put(i, LinkedListMultimap.<String, String>create());
    }
    UnorderedPartitionedKVWriter kvWriter = new UnorderedPartitionedKVWriterForTest(outputContext, conf, numPartitions, availableMemory);
    int sizePerBuffer = kvWriter.sizePerBuffer;
    BitSet partitionsWithData = new BitSet(numPartitions);
    Text keyText = new Text();
    Text valText = new Text();
    for (int i = 0; i < numRegularRecords; i++) {
        String key = createRandomString(Math.abs(random.nextInt(10)));
        String val = createRandomString(Math.abs(random.nextInt(20)));
        keyText.set(key);
        valText.set(val);
        int partition = partitioner.getPartition(keyText, valText, numPartitions);
        partitionsWithData.set(partition);
        expectedValues.get(partition).put(key, val);
        kvWriter.write(keyText, valText);
        numRecordsWritten++;
    }
    // Write Large key records
    for (int i = 0; i < numLargeKeys; i++) {
        String key = createRandomString(sizePerBuffer + Math.abs(random.nextInt(100)));
        String val = createRandomString(Math.abs(random.nextInt(20)));
        keyText.set(key);
        valText.set(val);
        int partition = partitioner.getPartition(keyText, valText, numPartitions);
        partitionsWithData.set(partition);
        expectedValues.get(partition).put(key, val);
        kvWriter.write(keyText, valText);
        numRecordsWritten++;
    }
    if (pipeliningEnabled) {
        verify(outputContext, times(numLargeKeys)).sendEvents(anyListOf(Event.class));
    }
    // Write Large val records
    for (int i = 0; i < numLargevalues; i++) {
        String key = createRandomString(Math.abs(random.nextInt(10)));
        String val = createRandomString(sizePerBuffer + Math.abs(random.nextInt(100)));
        keyText.set(key);
        valText.set(val);
        int partition = partitioner.getPartition(keyText, valText, numPartitions);
        partitionsWithData.set(partition);
        expectedValues.get(partition).put(key, val);
        kvWriter.write(keyText, valText);
        numRecordsWritten++;
    }
    if (pipeliningEnabled) {
        verify(outputContext, times(numLargevalues + numLargeKeys)).sendEvents(anyListOf(Event.class));
    }
    // Write records where key + val are large (but both can fit in the buffer individually)
    for (int i = 0; i < numLargeKvPairs; i++) {
        String key = createRandomString(sizePerBuffer / 2 + Math.abs(random.nextInt(100)));
        String val = createRandomString(sizePerBuffer / 2 + Math.abs(random.nextInt(100)));
        keyText.set(key);
        valText.set(val);
        int partition = partitioner.getPartition(keyText, valText, numPartitions);
        partitionsWithData.set(partition);
        expectedValues.get(partition).put(key, val);
        kvWriter.write(keyText, valText);
        numRecordsWritten++;
    }
    if (pipeliningEnabled) {
        verify(outputContext, times(numLargevalues + numLargeKeys + numLargeKvPairs)).sendEvents(anyListOf(Event.class));
    }
    List<Event> events = kvWriter.close();
    verify(outputContext, never()).reportFailure(any(TaskFailureType.class), any(Throwable.class), any(String.class));
    if (!pipeliningEnabled) {
        VertexManagerEvent vmEvent = null;
        for (Event event : events) {
            if (event instanceof VertexManagerEvent) {
                assertNull(vmEvent);
                vmEvent = (VertexManagerEvent) event;
            }
        }
        VertexManagerEventPayloadProto vmEventPayload = VertexManagerEventPayloadProto.parseFrom(ByteString.copyFrom(vmEvent.getUserPayload().asReadOnlyBuffer()));
        assertEquals(numRecordsWritten, vmEventPayload.getNumRecord());
    }
    TezCounter outputLargeRecordsCounter = counters.findCounter(TaskCounter.OUTPUT_LARGE_RECORDS);
    assertEquals(numLargeKeys + numLargevalues + numLargeKvPairs, outputLargeRecordsCounter.getValue());
    if (pipeliningEnabled || !isFinalMergeEnabled) {
        // verify spill data files and index file exist
        for (int i = 0; i < kvWriter.numSpills.get(); i++) {
            assertTrue(localFs.exists(kvWriter.outputFileHandler.getSpillFileForWrite(i, 0)));
            assertTrue(localFs.exists(kvWriter.outputFileHandler.getSpillIndexFileForWrite(i, 0)));
        }
        return;
    }
    // Validate the events
    assertEquals(2, events.size());
    assertTrue(events.get(0) instanceof VertexManagerEvent);
    VertexManagerEvent vme = (VertexManagerEvent) events.get(0);
    verifyPartitionStats(vme, partitionsWithData);
    assertTrue(events.get(1) instanceof CompositeDataMovementEvent);
    CompositeDataMovementEvent cdme = (CompositeDataMovementEvent) events.get(1);
    assertEquals(0, cdme.getSourceIndexStart());
    assertEquals(numPartitions, cdme.getCount());
    DataMovementEventPayloadProto eventProto = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
    BitSet emptyPartitionBits = null;
    if (partitionsWithData.cardinality() != numPartitions) {
        assertTrue(eventProto.hasEmptyPartitions());
        byte[] emptyPartitions = TezCommonUtils.decompressByteStringToByteArray(eventProto.getEmptyPartitions());
        emptyPartitionBits = TezUtilsInternal.fromByteArray(emptyPartitions);
        assertEquals(numPartitions - partitionsWithData.cardinality(), emptyPartitionBits.cardinality());
    } else {
        assertFalse(eventProto.hasEmptyPartitions());
        emptyPartitionBits = new BitSet(numPartitions);
    }
    assertEquals(HOST_STRING, eventProto.getHost());
    assertEquals(SHUFFLE_PORT, eventProto.getPort());
    assertEquals(uniqueId, eventProto.getPathComponent());
    // Verify the data
    // Verify the actual data
    TezTaskOutput taskOutput = new TezTaskOutputFiles(conf, uniqueId, dagId);
    Path outputFilePath = kvWriter.finalOutPath;
    Path spillFilePath = kvWriter.finalIndexPath;
    if (numRecordsWritten > 0) {
        assertTrue(localFs.exists(outputFilePath));
        assertTrue(localFs.exists(spillFilePath));
        assertEquals("Incorrect output permissions", (short) 0640, localFs.getFileStatus(outputFilePath).getPermission().toShort());
        assertEquals("Incorrect index permissions", (short) 0640, localFs.getFileStatus(spillFilePath).getPermission().toShort());
    } else {
        return;
    }
    // Special case for 0 records.
    TezSpillRecord spillRecord = new TezSpillRecord(spillFilePath, conf);
    DataInputBuffer keyBuffer = new DataInputBuffer();
    DataInputBuffer valBuffer = new DataInputBuffer();
    Text keyDeser = new Text();
    Text valDeser = new Text();
    for (int i = 0; i < numPartitions; i++) {
        if (emptyPartitionBits.get(i)) {
            continue;
        }
        TezIndexRecord indexRecord = spillRecord.getIndex(i);
        FSDataInputStream inStream = FileSystem.getLocal(conf).open(outputFilePath);
        inStream.seek(indexRecord.getStartOffset());
        IFile.Reader reader = new IFile.Reader(inStream, indexRecord.getPartLength(), codec, null, null, false, 0, -1);
        while (reader.nextRawKey(keyBuffer)) {
            reader.nextRawValue(valBuffer);
            keyDeser.readFields(keyBuffer);
            valDeser.readFields(valBuffer);
            int partition = partitioner.getPartition(keyDeser, valDeser, numPartitions);
            assertTrue(expectedValues.get(partition).remove(keyDeser.toString(), valDeser.toString()));
        }
        inStream.close();
    }
    for (int i = 0; i < numPartitions; i++) {
        assertEquals(0, expectedValues.get(i).size());
        expectedValues.remove(i);
    }
    assertEquals(0, expectedValues.size());
}
Also used : TezTaskOutputFiles(org.apache.tez.runtime.library.common.task.local.output.TezTaskOutputFiles) IFile(org.apache.tez.runtime.library.common.sort.impl.IFile) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezRuntimeConfiguration(org.apache.tez.runtime.library.api.TezRuntimeConfiguration) HashMap(java.util.HashMap) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) ByteString(com.google.protobuf.ByteString) Configurable(org.apache.hadoop.conf.Configurable) TezCounter(org.apache.tez.common.counters.TezCounter) TezSpillRecord(org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord) Random(java.util.Random) DataMovementEventPayloadProto(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.DataMovementEventPayloadProto) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) Partitioner(org.apache.tez.runtime.library.api.Partitioner) HashPartitioner(org.apache.tez.runtime.library.partitioner.HashPartitioner) Path(org.apache.hadoop.fs.Path) BitSet(java.util.BitSet) Text(org.apache.hadoop.io.Text) TezCounters(org.apache.tez.common.counters.TezCounters) OutputContext(org.apache.tez.runtime.api.OutputContext) LinkedListMultimap(com.google.common.collect.LinkedListMultimap) Multimap(com.google.common.collect.Multimap) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) DataInputBuffer(org.apache.hadoop.io.DataInputBuffer) TaskFailureType(org.apache.tez.runtime.api.TaskFailureType) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) TezIndexRecord(org.apache.tez.runtime.library.common.sort.impl.TezIndexRecord) HashPartitioner(org.apache.tez.runtime.library.partitioner.HashPartitioner) Event(org.apache.tez.runtime.api.Event) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) VertexManagerEventPayloadProto(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.VertexManagerEventPayloadProto) TezTaskOutput(org.apache.tez.runtime.library.common.task.local.output.TezTaskOutput)

Example 4 with TezSpillRecord

use of org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord in project tez by apache.

the class TestUnorderedPartitionedKVWriter method baseTest.

private void baseTest(int numRecords, int numPartitions, Set<Integer> skippedPartitions, boolean shouldCompress, int maxSingleBufferSizeBytes, int bufferMergePercent, int availableMemory) throws IOException, InterruptedException {
    PartitionerForTest partitioner = new PartitionerForTest();
    ApplicationId appId = ApplicationId.newInstance(10000000, 1);
    TezCounters counters = new TezCounters();
    String uniqueId = UUID.randomUUID().toString();
    int dagId = 1;
    String auxiliaryService = defaultConf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
    OutputContext outputContext = createMockOutputContext(counters, appId, uniqueId, auxiliaryService);
    Configuration conf = createConfiguration(outputContext, IntWritable.class, LongWritable.class, shouldCompress, maxSingleBufferSizeBytes);
    conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_UNORDERED_PARTITIONED_KVWRITER_BUFFER_MERGE_PERCENT, bufferMergePercent);
    CompressionCodec codec = null;
    if (shouldCompress) {
        codec = new DefaultCodec();
        ((Configurable) codec).setConf(conf);
    }
    int numOutputs = numPartitions;
    int numRecordsWritten = 0;
    Map<Integer, Multimap<Integer, Long>> expectedValues = new HashMap<Integer, Multimap<Integer, Long>>();
    for (int i = 0; i < numOutputs; i++) {
        expectedValues.put(i, LinkedListMultimap.<Integer, Long>create());
    }
    UnorderedPartitionedKVWriter kvWriter = new UnorderedPartitionedKVWriterForTest(outputContext, conf, numOutputs, availableMemory);
    int sizePerBuffer = kvWriter.sizePerBuffer;
    // IntW + LongW
    int sizePerRecord = 4 + 8;
    // Record + META_OVERHEAD
    int sizePerRecordWithOverhead = sizePerRecord + 12;
    IntWritable intWritable = new IntWritable();
    LongWritable longWritable = new LongWritable();
    BitSet partitionsWithData = new BitSet(numPartitions);
    for (int i = 0; i < numRecords; i++) {
        intWritable.set(i);
        longWritable.set(i);
        int partition = partitioner.getPartition(intWritable, longWritable, numOutputs);
        if (skippedPartitions != null && skippedPartitions.contains(partition)) {
            continue;
        }
        partitionsWithData.set(partition);
        expectedValues.get(partition).put(intWritable.get(), longWritable.get());
        kvWriter.write(intWritable, longWritable);
        numRecordsWritten++;
    }
    List<Event> events = kvWriter.close();
    if (numPartitions == 1) {
        assertEquals(true, kvWriter.skipBuffers);
    }
    int recordsPerBuffer = sizePerBuffer / sizePerRecordWithOverhead;
    int numExpectedSpills = numRecordsWritten / recordsPerBuffer / kvWriter.spillLimit;
    verify(outputContext, never()).reportFailure(any(TaskFailureType.class), any(Throwable.class), any(String.class));
    assertNull(kvWriter.currentBuffer);
    assertEquals(0, kvWriter.availableBuffers.size());
    // Verify the counters
    TezCounter outputRecordBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES);
    TezCounter outputRecordsCounter = counters.findCounter(TaskCounter.OUTPUT_RECORDS);
    TezCounter outputBytesWithOverheadCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_WITH_OVERHEAD);
    TezCounter fileOutputBytesCounter = counters.findCounter(TaskCounter.OUTPUT_BYTES_PHYSICAL);
    TezCounter spilledRecordsCounter = counters.findCounter(TaskCounter.SPILLED_RECORDS);
    TezCounter additionalSpillBytesWritternCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_WRITTEN);
    TezCounter additionalSpillBytesReadCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILLS_BYTES_READ);
    TezCounter numAdditionalSpillsCounter = counters.findCounter(TaskCounter.ADDITIONAL_SPILL_COUNT);
    assertEquals(numRecordsWritten * sizePerRecord, outputRecordBytesCounter.getValue());
    if (numPartitions > 1) {
        assertEquals(numRecordsWritten * sizePerRecordWithOverhead, outputBytesWithOverheadCounter.getValue());
    }
    assertEquals(numRecordsWritten, outputRecordsCounter.getValue());
    long fileOutputBytes = fileOutputBytesCounter.getValue();
    if (numRecordsWritten > 0) {
        assertTrue(fileOutputBytes > 0);
        if (!shouldCompress) {
            assertTrue(fileOutputBytes > outputRecordBytesCounter.getValue());
        }
    } else {
        assertEquals(0, fileOutputBytes);
    }
    assertEquals(recordsPerBuffer * numExpectedSpills, spilledRecordsCounter.getValue());
    long additionalSpillBytesWritten = additionalSpillBytesWritternCounter.getValue();
    long additionalSpillBytesRead = additionalSpillBytesReadCounter.getValue();
    if (numExpectedSpills == 0) {
        assertEquals(0, additionalSpillBytesWritten);
        assertEquals(0, additionalSpillBytesRead);
    } else {
        assertTrue(additionalSpillBytesWritten > 0);
        assertTrue(additionalSpillBytesRead > 0);
        if (!shouldCompress) {
            assertTrue(additionalSpillBytesWritten > (recordsPerBuffer * numExpectedSpills * sizePerRecord));
            assertTrue(additionalSpillBytesRead > (recordsPerBuffer * numExpectedSpills * sizePerRecord));
        }
    }
    assertEquals(additionalSpillBytesWritten, additionalSpillBytesRead);
    // due to multiple threads, buffers could be merged in chunks in scheduleSpill.
    assertTrue(numExpectedSpills >= numAdditionalSpillsCounter.getValue());
    BitSet emptyPartitionBits = null;
    // Verify the events returned
    assertEquals(2, events.size());
    assertTrue(events.get(0) instanceof VertexManagerEvent);
    VertexManagerEvent vme = (VertexManagerEvent) events.get(0);
    verifyPartitionStats(vme, partitionsWithData);
    assertTrue(events.get(1) instanceof CompositeDataMovementEvent);
    CompositeDataMovementEvent cdme = (CompositeDataMovementEvent) events.get(1);
    assertEquals(0, cdme.getSourceIndexStart());
    assertEquals(numOutputs, cdme.getCount());
    DataMovementEventPayloadProto eventProto = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
    if (skippedPartitions == null && numRecordsWritten > 0) {
        assertFalse(eventProto.hasEmptyPartitions());
        emptyPartitionBits = new BitSet(numPartitions);
    } else {
        assertTrue(eventProto.hasEmptyPartitions());
        byte[] emptyPartitions = TezCommonUtils.decompressByteStringToByteArray(eventProto.getEmptyPartitions());
        emptyPartitionBits = TezUtilsInternal.fromByteArray(emptyPartitions);
        if (numRecordsWritten == 0) {
            assertEquals(numPartitions, emptyPartitionBits.cardinality());
        } else {
            for (Integer e : skippedPartitions) {
                assertTrue(emptyPartitionBits.get(e));
            }
            assertEquals(skippedPartitions.size(), emptyPartitionBits.cardinality());
        }
    }
    if (emptyPartitionBits.cardinality() != numPartitions) {
        assertEquals(HOST_STRING, eventProto.getHost());
        assertEquals(SHUFFLE_PORT, eventProto.getPort());
        assertEquals(uniqueId, eventProto.getPathComponent());
    } else {
        assertFalse(eventProto.hasHost());
        assertFalse(eventProto.hasPort());
        assertFalse(eventProto.hasPathComponent());
    }
    // Verify the actual data
    TezTaskOutput taskOutput = new TezTaskOutputFiles(conf, uniqueId, dagId);
    Path outputFilePath = kvWriter.finalOutPath;
    Path spillFilePath = kvWriter.finalIndexPath;
    if (numRecordsWritten <= 0) {
        return;
    }
    assertTrue(localFs.exists(outputFilePath));
    assertTrue(localFs.exists(spillFilePath));
    // verify no intermediate spill files have been left around
    synchronized (kvWriter.spillInfoList) {
        for (SpillInfo spill : kvWriter.spillInfoList) {
            assertFalse("lingering intermediate spill file " + spill.outPath, localFs.exists(spill.outPath));
        }
    }
    // Special case for 0 records.
    TezSpillRecord spillRecord = new TezSpillRecord(spillFilePath, conf);
    DataInputBuffer keyBuffer = new DataInputBuffer();
    DataInputBuffer valBuffer = new DataInputBuffer();
    IntWritable keyDeser = new IntWritable();
    LongWritable valDeser = new LongWritable();
    for (int i = 0; i < numOutputs; i++) {
        TezIndexRecord indexRecord = spillRecord.getIndex(i);
        if (skippedPartitions != null && skippedPartitions.contains(i)) {
            assertFalse("The Index Record for partition " + i + " should not have any data", indexRecord.hasData());
            continue;
        }
        FSDataInputStream inStream = FileSystem.getLocal(conf).open(outputFilePath);
        inStream.seek(indexRecord.getStartOffset());
        IFile.Reader reader = new IFile.Reader(inStream, indexRecord.getPartLength(), codec, null, null, false, 0, -1);
        while (reader.nextRawKey(keyBuffer)) {
            reader.nextRawValue(valBuffer);
            keyDeser.readFields(keyBuffer);
            valDeser.readFields(valBuffer);
            int partition = partitioner.getPartition(keyDeser, valDeser, numOutputs);
            assertTrue(expectedValues.get(partition).remove(keyDeser.get(), valDeser.get()));
        }
        inStream.close();
    }
    for (int i = 0; i < numOutputs; i++) {
        assertEquals(0, expectedValues.get(i).size());
        expectedValues.remove(i);
    }
    assertEquals(0, expectedValues.size());
    verify(outputContext, atLeast(1)).notifyProgress();
}
Also used : TezTaskOutputFiles(org.apache.tez.runtime.library.common.task.local.output.TezTaskOutputFiles) IFile(org.apache.tez.runtime.library.common.sort.impl.IFile) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezRuntimeConfiguration(org.apache.tez.runtime.library.api.TezRuntimeConfiguration) HashMap(java.util.HashMap) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) ByteString(com.google.protobuf.ByteString) Configurable(org.apache.hadoop.conf.Configurable) TezCounter(org.apache.tez.common.counters.TezCounter) TezSpillRecord(org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord) SpillInfo(org.apache.tez.runtime.library.common.writers.UnorderedPartitionedKVWriter.SpillInfo) DataMovementEventPayloadProto(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.DataMovementEventPayloadProto) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) LongWritable(org.apache.hadoop.io.LongWritable) IntWritable(org.apache.hadoop.io.IntWritable) Path(org.apache.hadoop.fs.Path) BitSet(java.util.BitSet) TezCounters(org.apache.tez.common.counters.TezCounters) OutputContext(org.apache.tez.runtime.api.OutputContext) LinkedListMultimap(com.google.common.collect.LinkedListMultimap) Multimap(com.google.common.collect.Multimap) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) DataInputBuffer(org.apache.hadoop.io.DataInputBuffer) TaskFailureType(org.apache.tez.runtime.api.TaskFailureType) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) TezIndexRecord(org.apache.tez.runtime.library.common.sort.impl.TezIndexRecord) Event(org.apache.tez.runtime.api.Event) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) TezTaskOutput(org.apache.tez.runtime.library.common.task.local.output.TezTaskOutput)

Example 5 with TezSpillRecord

use of org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord in project tez by apache.

the class DefaultSorter method maybeAddEventsForSpills.

private void maybeAddEventsForSpills() throws IOException {
    if (isFinalMergeEnabled()) {
        return;
    }
    List<Event> events = Lists.newLinkedList();
    for (int i = 0; i < numSpills; i++) {
        TezSpillRecord spillRecord = indexCacheList.get(i);
        if (spillRecord == null) {
            // File was already written and location is stored in spillFileIndexPaths
            spillRecord = new TezSpillRecord(spillFileIndexPaths.get(i), conf);
        } else {
            // Double check if this file has to be written
            if (spillFileIndexPaths.get(i) == null) {
                Path indexPath = mapOutputFile.getSpillIndexFileForWrite(i, partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
                spillRecord.writeToFile(indexPath, conf);
            }
        }
        maybeSendEventForSpill(events, (i == numSpills - 1), spillRecord, i, false);
        fileOutputByteCounter.increment(rfs.getFileStatus(spillFilePaths.get(i)).getLen());
    }
    outputContext.sendEvents(events);
}
Also used : TezSpillRecord(org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord) Path(org.apache.hadoop.fs.Path) Event(org.apache.tez.runtime.api.Event)

Aggregations

TezSpillRecord (org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord)20 Path (org.apache.hadoop.fs.Path)14 TezIndexRecord (org.apache.tez.runtime.library.common.sort.impl.TezIndexRecord)11 Event (org.apache.tez.runtime.api.Event)9 BitSet (java.util.BitSet)7 ByteString (com.google.protobuf.ByteString)6 CompositeDataMovementEvent (org.apache.tez.runtime.api.events.CompositeDataMovementEvent)6 IFile (org.apache.tez.runtime.library.common.sort.impl.IFile)6 IOException (java.io.IOException)5 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)5 VertexManagerEvent (org.apache.tez.runtime.api.events.VertexManagerEvent)5 Writer (org.apache.tez.runtime.library.common.sort.impl.IFile.Writer)5 DataInputBuffer (org.apache.hadoop.io.DataInputBuffer)4 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)3 OutputContext (org.apache.tez.runtime.api.OutputContext)3 ShuffleUserPayloads (org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads)3 Test (org.junit.Test)3 Mockito.anyString (org.mockito.Mockito.anyString)3 VisibleForTesting (com.google.common.annotations.VisibleForTesting)2 LinkedListMultimap (com.google.common.collect.LinkedListMultimap)2