Search in sources :

Example 1 with HashPartitioner

Use of org.apache.tez.runtime.library.partitioner.HashPartitioner in the Apache Tez project.

The method textTest of the class TestUnorderedPartitionedKVWriter.

/**
 * Writes a mix of regular and oversized {@link Text} records through an
 * {@link UnorderedPartitionedKVWriter} and verifies the events, counters and files it produces.
 *
 * <p>Records are written in four phases: regular records, records with a large key, records with
 * a large value, and records whose key and value are each about half a buffer. Every record is
 * also stored in an expected-values map, keyed by the partition a {@link HashPartitioner} assigns,
 * so the final output can be compared against it.
 *
 * <p>When pipelining is enabled or the final merge is disabled, the method only checks that the
 * spill data and index files exist and then returns. Otherwise it validates the two events
 * returned by {@code close()}, the file permissions, and the content of every non-empty partition.
 *
 * @param numRegularRecords number of records whose key and value sizes are drawn from
 *                          {@code random.nextInt(10)} and {@code random.nextInt(20)}
 * @param numPartitions number of output partitions
 * @param availableMemory memory budget passed to the writer under test
 * @param numLargeKeys number of records whose key size is at least {@code sizePerBuffer}
 * @param numLargevalues number of records whose value size is at least {@code sizePerBuffer}
 * @param numLargeKvPairs number of records whose key and value sizes are each at least
 *                        {@code sizePerBuffer / 2}
 * @param pipeliningEnabled value set for TEZ_RUNTIME_PIPELINED_SHUFFLE_ENABLED
 * @param isFinalMergeEnabled value set for TEZ_RUNTIME_ENABLE_FINAL_MERGE_IN_OUTPUT
 * @throws IOException if writing or reading back the output fails
 * @throws InterruptedException declared because the writer calls may be interrupted
 */
public void textTest(int numRegularRecords, int numPartitions, long availableMemory, int numLargeKeys, int numLargevalues, int numLargeKvPairs, boolean pipeliningEnabled, boolean isFinalMergeEnabled) throws IOException, InterruptedException {
    Partitioner partitioner = new HashPartitioner();
    ApplicationId appId = ApplicationId.newInstance(10000000, 1);
    TezCounters counters = new TezCounters();
    String uniqueId = UUID.randomUUID().toString();
    String auxiliaryService = defaultConf.get(TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID, TezConfiguration.TEZ_AM_SHUFFLE_AUXILIARY_SERVICE_ID_DEFAULT);
    OutputContext outputContext = createMockOutputContext(counters, appId, uniqueId, auxiliaryService);
    Random random = new Random();
    Configuration conf = createConfiguration(outputContext, Text.class, Text.class, shouldCompress, -1, HashPartitioner.class);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_PIPELINED_SHUFFLE_ENABLED, pipeliningEnabled);
    conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_ENABLE_FINAL_MERGE_IN_OUTPUT, isFinalMergeEnabled);
    // The same codec is needed later to read the written data back.
    CompressionCodec codec = null;
    if (shouldCompress) {
        codec = new DefaultCodec();
        ((Configurable) codec).setConf(conf);
    }
    int numRecordsWritten = 0;
    // Expected key/value pairs per partition; entries are removed as they are read back.
    Map<Integer, Multimap<String, String>> expectedValues = new HashMap<Integer, Multimap<String, String>>();
    for (int i = 0; i < numPartitions; i++) {
        expectedValues.put(i, LinkedListMultimap.<String, String>create());
    }
    UnorderedPartitionedKVWriter kvWriter = new UnorderedPartitionedKVWriterForTest(outputContext, conf, numPartitions, availableMemory);
    int sizePerBuffer = kvWriter.sizePerBuffer;
    BitSet partitionsWithData = new BitSet(numPartitions);
    Text keyText = new Text();
    Text valText = new Text();
    // Write regular records. Random.nextInt(bound) is never negative, so no Math.abs is needed.
    for (int i = 0; i < numRegularRecords; i++) {
        String key = createRandomString(random.nextInt(10));
        String val = createRandomString(random.nextInt(20));
        keyText.set(key);
        valText.set(val);
        int partition = partitioner.getPartition(keyText, valText, numPartitions);
        partitionsWithData.set(partition);
        expectedValues.get(partition).put(key, val);
        kvWriter.write(keyText, valText);
        numRecordsWritten++;
    }
    // Write Large key records
    for (int i = 0; i < numLargeKeys; i++) {
        String key = createRandomString(sizePerBuffer + random.nextInt(100));
        String val = createRandomString(random.nextInt(20));
        keyText.set(key);
        valText.set(val);
        int partition = partitioner.getPartition(keyText, valText, numPartitions);
        partitionsWithData.set(partition);
        expectedValues.get(partition).put(key, val);
        kvWriter.write(keyText, valText);
        numRecordsWritten++;
    }
    // With pipelining, one sendEvents call is expected per large record written so far.
    if (pipeliningEnabled) {
        verify(outputContext, times(numLargeKeys)).sendEvents(anyListOf(Event.class));
    }
    // Write Large val records
    for (int i = 0; i < numLargevalues; i++) {
        String key = createRandomString(random.nextInt(10));
        String val = createRandomString(sizePerBuffer + random.nextInt(100));
        keyText.set(key);
        valText.set(val);
        int partition = partitioner.getPartition(keyText, valText, numPartitions);
        partitionsWithData.set(partition);
        expectedValues.get(partition).put(key, val);
        kvWriter.write(keyText, valText);
        numRecordsWritten++;
    }
    if (pipeliningEnabled) {
        verify(outputContext, times(numLargevalues + numLargeKeys)).sendEvents(anyListOf(Event.class));
    }
    // Write records where key + val are large (but both can fit in the buffer individually)
    for (int i = 0; i < numLargeKvPairs; i++) {
        String key = createRandomString(sizePerBuffer / 2 + random.nextInt(100));
        String val = createRandomString(sizePerBuffer / 2 + random.nextInt(100));
        keyText.set(key);
        valText.set(val);
        int partition = partitioner.getPartition(keyText, valText, numPartitions);
        partitionsWithData.set(partition);
        expectedValues.get(partition).put(key, val);
        kvWriter.write(keyText, valText);
        numRecordsWritten++;
    }
    if (pipeliningEnabled) {
        verify(outputContext, times(numLargevalues + numLargeKeys + numLargeKvPairs)).sendEvents(anyListOf(Event.class));
    }
    List<Event> events = kvWriter.close();
    verify(outputContext, never()).reportFailure(any(TaskFailureType.class), any(Throwable.class), any(String.class));
    if (!pipeliningEnabled) {
        // Exactly one VertexManagerEvent is expected; assertNull rejects a second one.
        VertexManagerEvent vmEvent = null;
        for (Event event : events) {
            if (event instanceof VertexManagerEvent) {
                assertNull(vmEvent);
                vmEvent = (VertexManagerEvent) event;
            }
        }
        // Fail with a clear message rather than a NullPointerException when the event is missing.
        assertTrue("Expected a VertexManagerEvent among the events returned by close()", vmEvent != null);
        VertexManagerEventPayloadProto vmEventPayload = VertexManagerEventPayloadProto.parseFrom(ByteString.copyFrom(vmEvent.getUserPayload().asReadOnlyBuffer()));
        assertEquals(numRecordsWritten, vmEventPayload.getNumRecord());
    }
    TezCounter outputLargeRecordsCounter = counters.findCounter(TaskCounter.OUTPUT_LARGE_RECORDS);
    assertEquals(numLargeKeys + numLargevalues + numLargeKvPairs, outputLargeRecordsCounter.getValue());
    if (pipeliningEnabled || !isFinalMergeEnabled) {
        // verify spill data files and index file exist
        for (int i = 0; i < kvWriter.numSpills.get(); i++) {
            assertTrue(localFs.exists(kvWriter.outputFileHandler.getSpillFileForWrite(i, 0)));
            assertTrue(localFs.exists(kvWriter.outputFileHandler.getSpillIndexFileForWrite(i, 0)));
        }
        return;
    }
    // Validate the events
    assertEquals(2, events.size());
    assertTrue(events.get(0) instanceof VertexManagerEvent);
    VertexManagerEvent vme = (VertexManagerEvent) events.get(0);
    verifyPartitionStats(vme, partitionsWithData);
    assertTrue(events.get(1) instanceof CompositeDataMovementEvent);
    CompositeDataMovementEvent cdme = (CompositeDataMovementEvent) events.get(1);
    assertEquals(0, cdme.getSourceIndexStart());
    assertEquals(numPartitions, cdme.getCount());
    DataMovementEventPayloadProto eventProto = DataMovementEventPayloadProto.parseFrom(ByteString.copyFrom(cdme.getUserPayload()));
    // The payload carries an empty-partitions bitset only when at least one partition has no data.
    BitSet emptyPartitionBits = null;
    if (partitionsWithData.cardinality() != numPartitions) {
        assertTrue(eventProto.hasEmptyPartitions());
        byte[] emptyPartitions = TezCommonUtils.decompressByteStringToByteArray(eventProto.getEmptyPartitions());
        emptyPartitionBits = TezUtilsInternal.fromByteArray(emptyPartitions);
        assertEquals(numPartitions - partitionsWithData.cardinality(), emptyPartitionBits.cardinality());
    } else {
        assertFalse(eventProto.hasEmptyPartitions());
        emptyPartitionBits = new BitSet(numPartitions);
    }
    assertEquals(HOST_STRING, eventProto.getHost());
    assertEquals(SHUFFLE_PORT, eventProto.getPort());
    assertEquals(uniqueId, eventProto.getPathComponent());
    // Verify the actual data
    Path outputFilePath = kvWriter.finalOutPath;
    Path spillFilePath = kvWriter.finalIndexPath;
    // Special case for 0 records: no file checks are performed.
    if (numRecordsWritten == 0) {
        return;
    }
    assertTrue(localFs.exists(outputFilePath));
    assertTrue(localFs.exists(spillFilePath));
    assertEquals("Incorrect output permissions", (short) 0640, localFs.getFileStatus(outputFilePath).getPermission().toShort());
    assertEquals("Incorrect index permissions", (short) 0640, localFs.getFileStatus(spillFilePath).getPermission().toShort());
    TezSpillRecord spillRecord = new TezSpillRecord(spillFilePath, conf);
    DataInputBuffer keyBuffer = new DataInputBuffer();
    DataInputBuffer valBuffer = new DataInputBuffer();
    Text keyDeser = new Text();
    Text valDeser = new Text();
    for (int i = 0; i < numPartitions; i++) {
        if (emptyPartitionBits.get(i)) {
            continue;
        }
        TezIndexRecord indexRecord = spillRecord.getIndex(i);
        FSDataInputStream inStream = FileSystem.getLocal(conf).open(outputFilePath);
        // Close the stream even when an assertion or a read fails part-way through the partition.
        try {
            inStream.seek(indexRecord.getStartOffset());
            IFile.Reader reader = new IFile.Reader(inStream, indexRecord.getPartLength(), codec, null, null, false, 0, -1);
            while (reader.nextRawKey(keyBuffer)) {
                reader.nextRawValue(valBuffer);
                keyDeser.readFields(keyBuffer);
                valDeser.readFields(valBuffer);
                int partition = partitioner.getPartition(keyDeser, valDeser, numPartitions);
                // Each record read back must match one that is still expected for its partition.
                assertTrue(expectedValues.get(partition).remove(keyDeser.toString(), valDeser.toString()));
            }
        } finally {
            inStream.close();
        }
    }
    // Every expected record must have been consumed by the read-back loop above.
    for (int i = 0; i < numPartitions; i++) {
        assertEquals(0, expectedValues.get(i).size());
        expectedValues.remove(i);
    }
    assertEquals(0, expectedValues.size());
}
Also used : TezTaskOutputFiles(org.apache.tez.runtime.library.common.task.local.output.TezTaskOutputFiles) IFile(org.apache.tez.runtime.library.common.sort.impl.IFile) Configuration(org.apache.hadoop.conf.Configuration) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) TezRuntimeConfiguration(org.apache.tez.runtime.library.api.TezRuntimeConfiguration) HashMap(java.util.HashMap) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) ByteString(com.google.protobuf.ByteString) Configurable(org.apache.hadoop.conf.Configurable) TezCounter(org.apache.tez.common.counters.TezCounter) TezSpillRecord(org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord) Random(java.util.Random) DataMovementEventPayloadProto(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.DataMovementEventPayloadProto) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) Partitioner(org.apache.tez.runtime.library.api.Partitioner) HashPartitioner(org.apache.tez.runtime.library.partitioner.HashPartitioner) Path(org.apache.hadoop.fs.Path) BitSet(java.util.BitSet) Text(org.apache.hadoop.io.Text) TezCounters(org.apache.tez.common.counters.TezCounters) OutputContext(org.apache.tez.runtime.api.OutputContext) LinkedListMultimap(com.google.common.collect.LinkedListMultimap) Multimap(com.google.common.collect.Multimap) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) DataInputBuffer(org.apache.hadoop.io.DataInputBuffer) TaskFailureType(org.apache.tez.runtime.api.TaskFailureType) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) TezIndexRecord(org.apache.tez.runtime.library.common.sort.impl.TezIndexRecord) HashPartitioner(org.apache.tez.runtime.library.partitioner.HashPartitioner) Event(org.apache.tez.runtime.api.Event) VertexManagerEvent(org.apache.tez.runtime.api.events.VertexManagerEvent) CompositeDataMovementEvent(org.apache.tez.runtime.api.events.CompositeDataMovementEvent) 
FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) VertexManagerEventPayloadProto(org.apache.tez.runtime.library.shuffle.impl.ShuffleUserPayloads.VertexManagerEventPayloadProto) TezTaskOutput(org.apache.tez.runtime.library.common.task.local.output.TezTaskOutput)

Aggregations

LinkedListMultimap (com.google.common.collect.LinkedListMultimap)1 Multimap (com.google.common.collect.Multimap)1 ByteString (com.google.protobuf.ByteString)1 BitSet (java.util.BitSet)1 HashMap (java.util.HashMap)1 Random (java.util.Random)1 Configurable (org.apache.hadoop.conf.Configurable)1 Configuration (org.apache.hadoop.conf.Configuration)1 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)1 Path (org.apache.hadoop.fs.Path)1 DataInputBuffer (org.apache.hadoop.io.DataInputBuffer)1 Text (org.apache.hadoop.io.Text)1 CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)1 DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec)1 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)1 TezCounter (org.apache.tez.common.counters.TezCounter)1 TezCounters (org.apache.tez.common.counters.TezCounters)1 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)1 Event (org.apache.tez.runtime.api.Event)1 OutputContext (org.apache.tez.runtime.api.OutputContext)1