
Example 1 with InMemoryWriter

Use of org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter in project tez by Apache.

The class TestValuesIterator, method createInMemStreams:

/**
 * Create in-memory segments.
 *
 * @return list of in-memory segments
 * @throws IOException
 */
@SuppressWarnings("unchecked")
public List<TezMerger.Segment> createInMemStreams() throws IOException {
    int numberOfStreams = Math.max(2, rnd.nextInt(10));
    LOG.info("No of streams : " + numberOfStreams);
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    Serializer keySerializer = serializationFactory.getSerializer(keyClass);
    Serializer valueSerializer = serializationFactory.getSerializer(valClass);
    LocalDirAllocator localDirAllocator = new LocalDirAllocator(TezRuntimeFrameworkConfigs.LOCAL_DIRS);
    InputContext context = createTezInputContext();
    MergeManager mergeManager = new MergeManager(conf, fs, localDirAllocator, context, null, null, null, null, null, 1024 * 1024 * 10, null, false, -1);
    DataOutputBuffer keyBuf = new DataOutputBuffer();
    DataOutputBuffer valBuf = new DataOutputBuffer();
    DataInputBuffer keyIn = new DataInputBuffer();
    DataInputBuffer valIn = new DataInputBuffer();
    keySerializer.open(keyBuf);
    valueSerializer.open(valBuf);
    List<TezMerger.Segment> segments = new LinkedList<TezMerger.Segment>();
    for (int i = 0; i < numberOfStreams; i++) {
        BoundedByteArrayOutputStream bout = new BoundedByteArrayOutputStream(1024 * 1024);
        InMemoryWriter writer = new InMemoryWriter(bout);
        Map<Writable, Writable> data = createData();
        // write data
        for (Map.Entry<Writable, Writable> entry : data.entrySet()) {
            keySerializer.serialize(entry.getKey());
            valueSerializer.serialize(entry.getValue());
            keyIn.reset(keyBuf.getData(), 0, keyBuf.getLength());
            valIn.reset(valBuf.getData(), 0, valBuf.getLength());
            writer.append(keyIn, valIn);
            originalData.put(entry.getKey(), entry.getValue());
            keyBuf.reset();
            valBuf.reset();
            keyIn.reset();
            valIn.reset();
        }
        IFile.Reader reader = new InMemoryReader(mergeManager, null, bout.getBuffer(), 0, bout.getBuffer().length);
        segments.add(new TezMerger.Segment(reader, null));
        data.clear();
        writer.close();
    }
    return segments;
}
Also used : InMemoryReader(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryReader) IFile(org.apache.tez.runtime.library.common.sort.impl.IFile) InputContext(org.apache.tez.runtime.api.InputContext) SerializationFactory(org.apache.hadoop.io.serializer.SerializationFactory) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) IntWritable(org.apache.hadoop.io.IntWritable) BytesWritable(org.apache.hadoop.io.BytesWritable) MergeManager(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.MergeManager) LinkedList(java.util.LinkedList) TezMerger(org.apache.tez.runtime.library.common.sort.impl.TezMerger) InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter) DataInputBuffer(org.apache.hadoop.io.DataInputBuffer) BoundedByteArrayOutputStream(org.apache.hadoop.io.BoundedByteArrayOutputStream) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer) LocalDirAllocator(org.apache.hadoop.fs.LocalDirAllocator) Map(java.util.Map) TreeMap(java.util.TreeMap) Serializer(org.apache.hadoop.io.serializer.Serializer)
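The pattern above is: serialize each Writable into a DataOutputBuffer, wrap the resulting bytes in a DataInputBuffer, and append that pair to an InMemoryWriter backed by a BoundedByteArrayOutputStream. A minimal, self-contained sketch of that round trip follows; the class name InMemoryWriterSketch and the Text/IntWritable key and value types are illustrative choices, not taken from the test above.

import java.io.IOException;

import org.apache.hadoop.io.BoundedByteArrayOutputStream;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter;

public class InMemoryWriterSketch {

    public static void main(String[] args) throws IOException {
        // Backing buffer for the in-memory segment (1 MB cap, as in the test above).
        BoundedByteArrayOutputStream bout = new BoundedByteArrayOutputStream(1024 * 1024);
        InMemoryWriter writer = new InMemoryWriter(bout);
        // Serialize into DataOutputBuffers, then expose the bytes to append()
        // through DataInputBuffer views.
        DataOutputBuffer keyBuf = new DataOutputBuffer();
        DataOutputBuffer valBuf = new DataOutputBuffer();
        DataInputBuffer keyIn = new DataInputBuffer();
        DataInputBuffer valIn = new DataInputBuffer();
        new Text("key0").write(keyBuf);
        new IntWritable(42).write(valBuf);
        keyIn.reset(keyBuf.getData(), 0, keyBuf.getLength());
        valIn.reset(valBuf.getData(), 0, valBuf.getLength());
        writer.append(keyIn, valIn);
        // close() finalizes the in-memory IFile stream; the segment bytes are then
        // available via bout.getBuffer() / bout.getLimit().
        writer.close();
        System.out.println("bytes written: " + bout.getLimit());
    }
}

The reader side (wrapping bout.getBuffer() in an InMemoryReader) is shown in the test above and in Example 2.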

Example 2 with InMemoryWriter

Use of org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter in project tez by Apache.

The class TestTezMerger, method createInMemorySegments:

private List<TezMerger.Segment> createInMemorySegments(int segmentCount, int keysPerSegment) throws IOException {
    List<TezMerger.Segment> segmentList = Lists.newLinkedList();
    Random rnd = new Random();
    DataInputBuffer key = new DataInputBuffer();
    DataInputBuffer value = new DataInputBuffer();
    for (int i = 0; i < segmentCount; i++) {
        BoundedByteArrayOutputStream stream = new BoundedByteArrayOutputStream(10000);
        InMemoryWriter writer = new InMemoryWriter(stream);
        for (int j = 0; j < keysPerSegment; j++) {
            populateData(new IntWritable(rnd.nextInt()), new LongWritable(rnd.nextLong()), key, value);
            writer.append(key, value);
        }
        writer.close();
        InMemoryReader reader = new InMemoryReader(merger, null, stream.getBuffer(), 0, stream.getLimit());
        segmentList.add(new TezMerger.Segment(reader, null));
    }
    return segmentList;
}
Also used : InMemoryReader(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryReader) InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter) DataInputBuffer(org.apache.hadoop.io.DataInputBuffer) Random(java.util.Random) BoundedByteArrayOutputStream(org.apache.hadoop.io.BoundedByteArrayOutputStream) LongWritable(org.apache.hadoop.io.LongWritable) IntWritable(org.apache.hadoop.io.IntWritable)
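populateData() is a private helper of TestTezMerger whose body is not shown on this page. A plausible stand-in, under the assumption that it simply serializes the two Writables and points the reusable DataInputBuffers at the resulting bytes (the same pattern as Example 1), is sketched below; the buffer handling here is a guess, not the project's actual code.

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;

final class PopulateDataSketch {

    // Hypothetical stand-in for TestTezMerger#populateData: serialize the key and
    // value Writables, then reset the DataInputBuffers so writer.append(key, value)
    // can copy the bytes.
    static void populateData(IntWritable intKey, LongWritable longVal,
            DataInputBuffer key, DataInputBuffer value) throws IOException {
        DataOutputBuffer keyOut = new DataOutputBuffer();
        DataOutputBuffer valOut = new DataOutputBuffer();
        intKey.write(keyOut);
        longVal.write(valOut);
        key.reset(keyOut.getData(), 0, keyOut.getLength());
        value.reset(valOut.getData(), 0, valOut.getLength());
    }
}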

Example 3 with InMemoryWriter

Use of org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter in project tez by Apache.

The class TestIFile, method testWithRLEMarker:

@Test(timeout = 5000)
// test with sorted data and repeat keys
public void testWithRLEMarker() throws IOException {
    // Test with append(Object, Object)
    FSDataOutputStream out = localFs.create(outputPath);
    IFile.Writer writer = new IFile.Writer(defaultConf, out, Text.class, IntWritable.class, codec, null, null, true);
    Text key = new Text("key0");
    IntWritable value = new IntWritable(0);
    writer.append(key, value);
    // same key (RLE should kick in)
    key = new Text("key0");
    writer.append(key, value);
    assertTrue(writer.sameKey);
    // Different key
    key = new Text("key1");
    writer.append(key, value);
    assertFalse(writer.sameKey);
    writer.close();
    out.close();
    // Test with append(DataInputBuffer key, DataInputBuffer value)
    byte[] kvbuffer = "key1Value1key1Value2key3Value3".getBytes();
    int keyLength = 4;
    int valueLength = 6;
    int pos = 0;
    out = localFs.create(outputPath);
    writer = new IFile.Writer(defaultConf, out, Text.class, IntWritable.class, codec, null, null, true);
    BoundedByteArrayOutputStream boundedOut = new BoundedByteArrayOutputStream(1024 * 1024);
    Writer inMemWriter = new InMemoryWriter(boundedOut, true);
    DataInputBuffer kin = new DataInputBuffer();
    kin.reset(kvbuffer, pos, keyLength);
    DataInputBuffer vin = new DataInputBuffer();
    DataOutputBuffer vout = new DataOutputBuffer();
    (new IntWritable(0)).write(vout);
    vin.reset(vout.getData(), vout.getLength());
    // Write initial KV pair
    writer.append(kin, vin);
    assertFalse(writer.sameKey);
    inMemWriter.append(kin, vin);
    assertFalse(inMemWriter.sameKey);
    pos += (keyLength + valueLength);
    // Second key is similar to key1 (RLE should kick in)
    kin.reset(kvbuffer, pos, keyLength);
    (new IntWritable(0)).write(vout);
    vin.reset(vout.getData(), vout.getLength());
    writer.append(kin, vin);
    assertTrue(writer.sameKey);
    inMemWriter.append(kin, vin);
    assertTrue(inMemWriter.sameKey);
    pos += (keyLength + valueLength);
    // Next key (key3) is different (RLE should not kick in)
    kin.reset(kvbuffer, pos, keyLength);
    (new IntWritable(0)).write(vout);
    vin.reset(vout.getData(), vout.getLength());
    writer.append(kin, vin);
    assertFalse(writer.sameKey);
    inMemWriter.append(kin, vin);
    assertFalse(inMemWriter.sameKey);
    writer.close();
    out.close();
    inMemWriter.close();
    boundedOut.close();
}
Also used : InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter) DataInputBuffer(org.apache.hadoop.io.DataInputBuffer) BoundedByteArrayOutputStream(org.apache.hadoop.io.BoundedByteArrayOutputStream) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer) Text(org.apache.hadoop.io.Text) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)
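The in-memory half of the RLE check can be isolated into a small sketch: append the same key bytes twice to an InMemoryWriter constructed with the RLE flag, then a different key would break the run. The buffer size, key string, and variable names below are illustrative; only the InMemoryWriter(bout, true) constructor and append(DataInputBuffer, DataInputBuffer) calls are taken from the test above.

import java.io.IOException;

import org.apache.hadoop.io.BoundedByteArrayOutputStream;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter;

public class RleSketch {

    public static void main(String[] args) throws IOException {
        BoundedByteArrayOutputStream bout = new BoundedByteArrayOutputStream(64 * 1024);
        // Second constructor argument enables run-length encoding of repeated keys.
        InMemoryWriter writer = new InMemoryWriter(bout, true);

        DataOutputBuffer keyOut = new DataOutputBuffer();
        DataOutputBuffer valOut = new DataOutputBuffer();
        new Text("key0").write(keyOut);
        new IntWritable(0).write(valOut);

        DataInputBuffer keyIn = new DataInputBuffer();
        DataInputBuffer valIn = new DataInputBuffer();

        // First occurrence of "key0": the full key is written.
        keyIn.reset(keyOut.getData(), 0, keyOut.getLength());
        valIn.reset(valOut.getData(), 0, valOut.getLength());
        writer.append(keyIn, valIn);

        // Same key bytes again: with RLE enabled the writer detects the repeat
        // (the test above observes this through the writer's sameKey flag).
        keyIn.reset(keyOut.getData(), 0, keyOut.getLength());
        valIn.reset(valOut.getData(), 0, valOut.getLength());
        writer.append(keyIn, valIn);

        writer.close();
    }
}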

Example 4 with InMemoryWriter

Use of org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter in project tez by Apache.

The class TestIFile, method testInMemoryWriter:

@Test(timeout = 5000)
// Test InMemoryWriter
public void testInMemoryWriter() throws IOException {
    InMemoryWriter writer = null;
    BoundedByteArrayOutputStream bout = new BoundedByteArrayOutputStream(1024 * 1024);
    List<KVPair> data = KVDataGen.generateTestData(true, 10);
    // No RLE, No RepeatKeys, no compression
    writer = new InMemoryWriter(bout);
    writeTestFileUsingDataBuffer(writer, false, data);
    readUsingInMemoryReader(bout.getBuffer(), data);
    // No RLE, RepeatKeys, no compression
    bout.reset();
    writer = new InMemoryWriter(bout);
    writeTestFileUsingDataBuffer(writer, true, data);
    readUsingInMemoryReader(bout.getBuffer(), data);
    // RLE, No RepeatKeys, no compression
    bout.reset();
    writer = new InMemoryWriter(bout, true);
    writeTestFileUsingDataBuffer(writer, false, data);
    readUsingInMemoryReader(bout.getBuffer(), data);
    // RLE, RepeatKeys, no compression
    bout.reset();
    writer = new InMemoryWriter(bout, true);
    writeTestFileUsingDataBuffer(writer, true, data);
    readUsingInMemoryReader(bout.getBuffer(), data);
}
Also used : InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter) BoundedByteArrayOutputStream(org.apache.hadoop.io.BoundedByteArrayOutputStream) KVPair(org.apache.tez.runtime.library.testutils.KVDataGen.KVPair) Test(org.junit.Test)
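The four configurations above differ only in two booleans: whether the writer is constructed with RLE enabled, and whether the generated data repeats keys. A compact equivalent, written as it would appear inside the same test class (it reuses TestIFile's own helpers writeTestFileUsingDataBuffer() and readUsingInMemoryReader(), whose bodies are not shown on this page), might look like the sketch below.

// Sketch only: depends on TestIFile's private helpers, which this page does not show.
BoundedByteArrayOutputStream bout = new BoundedByteArrayOutputStream(1024 * 1024);
List<KVPair> data = KVDataGen.generateTestData(true, 10);
for (boolean rle : new boolean[] { false, true }) {
    for (boolean repeatKeys : new boolean[] { false, true }) {
        bout.reset();
        InMemoryWriter writer = rle ? new InMemoryWriter(bout, true) : new InMemoryWriter(bout);
        writeTestFileUsingDataBuffer(writer, repeatKeys, data);
        readUsingInMemoryReader(bout.getBuffer(), data);
    }
}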

Aggregations

BoundedByteArrayOutputStream (org.apache.hadoop.io.BoundedByteArrayOutputStream): 4
InMemoryWriter (org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter): 4
DataInputBuffer (org.apache.hadoop.io.DataInputBuffer): 3
IntWritable (org.apache.hadoop.io.IntWritable): 3
DataOutputBuffer (org.apache.hadoop.io.DataOutputBuffer): 2
LongWritable (org.apache.hadoop.io.LongWritable): 2
InMemoryReader (org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryReader): 2
Test (org.junit.Test): 2
LinkedList (java.util.LinkedList): 1
Map (java.util.Map): 1
Random (java.util.Random): 1
TreeMap (java.util.TreeMap): 1
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 1
LocalDirAllocator (org.apache.hadoop.fs.LocalDirAllocator): 1
BytesWritable (org.apache.hadoop.io.BytesWritable): 1
Text (org.apache.hadoop.io.Text): 1
Writable (org.apache.hadoop.io.Writable): 1
SerializationFactory (org.apache.hadoop.io.serializer.SerializationFactory): 1
Serializer (org.apache.hadoop.io.serializer.Serializer): 1
InputContext (org.apache.tez.runtime.api.InputContext): 1