Search in sources:

Example 1 with Writer

Use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in the Apache Tez project.

Class TestIFile, method writeTestFileUsingDataBuffer.

/**
 * Creates the file at {@code outputPath} on {@code localFs}, wraps the stream in an
 * {@link IFile.Writer} for {@code Text}/{@code IntWritable} records and delegates the actual
 * writing to {@code writeTestFileUsingDataBuffer(writer, repeatKeys, data)}.
 *
 * @param rle        forwarded as the last constructor argument of the writer
 *                   (presumably enables run-length encoding of repeated keys - confirm
 *                   against IFile.Writer)
 * @param repeatKeys forwarded unchanged to the delegate helper
 * @param data       key/value pairs forwarded unchanged to the delegate helper
 * @param codec      compression codec handed to the writer
 * @return the writer that was used, returned after the underlying stream has been closed
 * @throws IOException if creating the file, writing, or closing the stream fails
 */
private Writer writeTestFileUsingDataBuffer(boolean rle, boolean repeatKeys, List<KVPair> data, CompressionCodec codec) throws IOException {
    FSDataOutputStream out = localFs.create(outputPath);
    try {
        IFile.Writer writer = new IFile.Writer(defaultConf, out, Text.class, IntWritable.class, codec, null, null, rle);
        writeTestFileUsingDataBuffer(writer, repeatKeys, data);
        return writer;
    } finally {
        // Close the stream on the failure path too, so a failing write does not leak the file handle.
        out.close();
    }
}
Also used : FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter)

Example 2 with Writer

Use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in the Apache Tez project.

Class TestIFile, method testAppendValues.

/**
 * Test appendValues feature: one key/value pair is written with {@code append}, the
 * remaining values for that key are streamed through {@code appendValues}, and a final pair
 * with a different key is appended before the file is read back and verified.
 */
@Test(timeout = 5000)
public void testAppendValues() throws IOException {
    Text repeatedKey = new Text("key");
    IntWritable repeatedValue = new IntWritable(1);
    List<KVPair> expected = new ArrayList<KVPair>();
    List<IntWritable> values = new ArrayList<IntWritable>();
    int remaining = 5;
    while (remaining-- > 0) {
        expected.add(new KVPair(repeatedKey, repeatedValue));
        values.add(repeatedValue);
    }

    IFile.Writer writer = new IFile.Writer(defaultConf, localFs, outputPath, Text.class, IntWritable.class, codec, null, null);

    // The first pair is written as a full key/value record.
    KVPair first = expected.get(0);
    writer.append(first.getKey(), first.getvalue());

    // Every value after the first is handed over in one appendValues call.
    writer.appendValues(values.subList(1, values.size()).iterator());

    // Finish with a pair that uses a different key.
    Text lastKey = new Text("key3");
    IntWritable lastVal = new IntWritable(10);
    writer.append(lastKey, lastVal);
    expected.add(new KVPair(lastKey, lastVal));

    writer.close();
    readAndVerifyData(writer.getRawLength(), writer.getCompressedLength(), expected, codec);
}
Also used : KVPair(org.apache.tez.runtime.library.testutils.KVDataGen.KVPair) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) IntWritable(org.apache.hadoop.io.IntWritable) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter) Test(org.junit.Test)

Example 3 with Writer

Use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in the Apache Tez project.

Class TestIFile, method testAppendValue.

/**
 * Test appendValue feature: walks generated test data and writes a full key/value record
 * whenever the key differs from the previous pair's key, and only the value (via
 * {@code appendValue}) when the key compares equal to it. The file is then read back and
 * verified against the same data.
 */
@Test(timeout = 5000)
public void testAppendValue() throws IOException {
    List<KVPair> data = KVDataGen.generateTestData(false, rnd.nextInt(100));
    IFile.Writer writer = new IFile.Writer(defaultConf, localFs, outputPath, Text.class, IntWritable.class, codec, null, null);

    Text lastKeySeen = null;
    for (KVPair pair : data) {
        Text currentKey = pair.getKey();
        // The very first pair, or any pair whose key differs from its predecessor, starts a new key.
        boolean startsNewKey = lastKeySeen == null || lastKeySeen.compareTo(currentKey) != 0;
        if (startsNewKey) {
            writer.append(currentKey, pair.getvalue());
        } else {
            writer.appendValue(pair.getvalue());
        }
        lastKeySeen = currentKey;
    }

    writer.close();
    readAndVerifyData(writer.getRawLength(), writer.getCompressedLength(), data, codec);
}
Also used : KVPair(org.apache.tez.runtime.library.testutils.KVDataGen.KVPair) Text(org.apache.hadoop.io.Text) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter) Test(org.junit.Test)

Example 4 with Writer

Use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in the Apache Tez project.

Class TestIFile, method writeTestFile.

/**
 * Creates the file at {@code outputPath} on {@code localFs}, wraps the stream in an
 * {@link IFile.Writer} for {@code Text}/{@code IntWritable} records and delegates the actual
 * writing to {@code writeTestFile(writer, repeatKeys, data)}.
 *
 * @param rle        forwarded as the last constructor argument of the writer
 *                   (presumably enables run-length encoding of repeated keys - confirm
 *                   against IFile.Writer)
 * @param repeatKeys forwarded unchanged to the delegate helper
 * @param data       key/value pairs forwarded unchanged to the delegate helper
 * @param codec      compression codec handed to the writer
 * @return the writer that was used, returned after the underlying stream has been closed
 * @throws IOException if creating the file, writing, or closing the stream fails
 */
private Writer writeTestFile(boolean rle, boolean repeatKeys, List<KVPair> data, CompressionCodec codec) throws IOException {
    FSDataOutputStream out = localFs.create(outputPath);
    try {
        IFile.Writer writer = new IFile.Writer(defaultConf, out, Text.class, IntWritable.class, codec, null, null, rle);
        writeTestFile(writer, repeatKeys, data);
        return writer;
    } finally {
        // Close the stream on the failure path too, so a failing write does not leak the file handle.
        out.close();
    }
}
Also used : FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter)

Example 5 with Writer

Use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in the Apache Tez project.

Class TestIFile, method testWithRLEMarker.

/**
 * Checks the {@code sameKey} flag of the writers after each append, in two phases:
 * first through {@code append(Object, Object)} on an {@link IFile.Writer}, then through
 * {@code append(DataInputBuffer, DataInputBuffer)} on both an {@link IFile.Writer} and an
 * {@link InMemoryWriter}. In both phases a repeated key is expected to set {@code sameKey}
 * and a different key is expected to clear it.
 */
@Test(timeout = 5000)
public // test with sorted data and repeat keys
void testWithRLEMarker() throws IOException {
    // Test with append(Object, Object)
    FSDataOutputStream out = localFs.create(outputPath);
    // Last constructor argument is true here (presumably the RLE switch - confirm against IFile.Writer)
    IFile.Writer writer = new IFile.Writer(defaultConf, out, Text.class, IntWritable.class, codec, null, null, true);
    Text key = new Text("key0");
    IntWritable value = new IntWritable(0);
    writer.append(key, value);
    // same key (RLE should kick in)
    // A fresh Text instance with equal content is used, not the same object
    key = new Text("key0");
    writer.append(key, value);
    assertTrue(writer.sameKey);
    // Different key
    key = new Text("key1");
    writer.append(key, value);
    assertFalse(writer.sameKey);
    writer.close();
    out.close();
    // Test with append(DataInputBuffer key, DataInputBuffer value)
    // Buffer layout: three records, each a 4-byte key followed by a 6-byte value.
    // Only the key slices are read from it below; the values come from vout instead.
    byte[] kvbuffer = "key1Value1key1Value2key3Value3".getBytes();
    int keyLength = 4;
    int valueLength = 6;
    // Offset of the current record's key inside kvbuffer
    int pos = 0;
    // The same output path is created again for the second phase
    out = localFs.create(outputPath);
    writer = new IFile.Writer(defaultConf, out, Text.class, IntWritable.class, codec, null, null, true);
    BoundedByteArrayOutputStream boundedOut = new BoundedByteArrayOutputStream(1024 * 1024);
    Writer inMemWriter = new InMemoryWriter(boundedOut, true);
    DataInputBuffer kin = new DataInputBuffer();
    kin.reset(kvbuffer, pos, keyLength);
    DataInputBuffer vin = new DataInputBuffer();
    DataOutputBuffer vout = new DataOutputBuffer();
    // Serialize an IntWritable(0) to serve as the value bytes
    (new IntWritable(0)).write(vout);
    vin.reset(vout.getData(), vout.getLength());
    // Write initial KV pair
    writer.append(kin, vin);
    assertFalse(writer.sameKey);
    inMemWriter.append(kin, vin);
    assertFalse(inMemWriter.sameKey);
    // Advance to the next record's key
    pos += (keyLength + valueLength);
    // Second key is similar to key1 (RLE should kick in)
    kin.reset(kvbuffer, pos, keyLength);
    // NOTE(review): vout is never reset between writes, so it presumably accumulates and
    // vin covers more bytes on each round - confirm whether that is intended.
    (new IntWritable(0)).write(vout);
    vin.reset(vout.getData(), vout.getLength());
    writer.append(kin, vin);
    assertTrue(writer.sameKey);
    inMemWriter.append(kin, vin);
    assertTrue(inMemWriter.sameKey);
    pos += (keyLength + valueLength);
    // Next key (key3) is different (RLE should not kick in)
    kin.reset(kvbuffer, pos, keyLength);
    (new IntWritable(0)).write(vout);
    vin.reset(vout.getData(), vout.getLength());
    writer.append(kin, vin);
    assertFalse(writer.sameKey);
    inMemWriter.append(kin, vin);
    assertFalse(inMemWriter.sameKey);
    // Close each writer together with the stream it writes to
    writer.close();
    out.close();
    inMemWriter.close();
    boundedOut.close();
}
Also used : InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter) DataInputBuffer(org.apache.hadoop.io.DataInputBuffer) BoundedByteArrayOutputStream(org.apache.hadoop.io.BoundedByteArrayOutputStream) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer) Text(org.apache.hadoop.io.Text) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)

Aggregations

Writer (org.apache.tez.runtime.library.common.sort.impl.IFile.Writer)25 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)12 Test (org.junit.Test)12 InMemoryWriter (org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter)11 Path (org.apache.hadoop.fs.Path)8 DataInputBuffer (org.apache.hadoop.io.DataInputBuffer)6 ArrayList (java.util.ArrayList)5 Text (org.apache.hadoop.io.Text)5 TezIndexRecord (org.apache.tez.runtime.library.common.sort.impl.TezIndexRecord)5 TezSpillRecord (org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord)5 KVPair (org.apache.tez.runtime.library.testutils.KVDataGen.KVPair)5 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)4 TaskContext (org.apache.tez.runtime.api.TaskContext)4 IFile (org.apache.tez.runtime.library.common.sort.impl.IFile)4 IOException (java.io.IOException)3 IntWritable (org.apache.hadoop.io.IntWritable)3 Reader (org.apache.tez.runtime.library.common.sort.impl.IFile.Reader)3 DiskSegment (org.apache.tez.runtime.library.common.sort.impl.TezMerger.DiskSegment)3 Segment (org.apache.tez.runtime.library.common.sort.impl.TezMerger.Segment)3 TezRawKeyValueIterator (org.apache.tez.runtime.library.common.sort.impl.TezRawKeyValueIterator)3