
Example 16 with BytesInput

Use of org.apache.parquet.bytes.BytesInput in the apache/parquet-mr project.

From the class TestDictionary, method testLongDictionary.

@Test
public void testLongDictionary() throws IOException {
    int COUNT = 1000;
    int COUNT2 = 2000;
    // dictionary-backed writer that falls back to plain encoding if the dictionary grows too large
    final FallbackValuesWriter<PlainLongDictionaryValuesWriter, PlainValuesWriter> cw = newPlainLongDictionaryValuesWriter(10000, 10000);
    // 1000 writes but only 50 distinct values (i % 50), so the dictionary holds exactly 50 entries
    for (long i = 0; i < COUNT; i++) {
        cw.writeLong(i % 50);
    }
    BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    assertEquals(50, cw.initialWriter.getDictionarySize());
    for (long i = COUNT2; i > 0; i--) {
        cw.writeLong(i % 50);
    }
    BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    assertEquals(50, cw.initialWriter.getDictionarySize());
    // a single dictionary reader decodes both pages against the dictionary built above
    DictionaryValuesReader cr = initDicReader(cw, PrimitiveTypeName.INT64);
    cr.initFromPage(COUNT, bytes1.toInputStream());
    for (long i = 0; i < COUNT; i++) {
        long back = cr.readLong();
        assertEquals(i % 50, back);
    }
    cr.initFromPage(COUNT2, bytes2.toInputStream());
    for (long i = COUNT2; i > 0; i--) {
        long back = cr.readLong();
        assertEquals(i % 50, back);
    }
}
Also used: PlainValuesWriter (org.apache.parquet.column.values.plain.PlainValuesWriter), BytesInput (org.apache.parquet.bytes.BytesInput), PlainLongDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter), Test (org.junit.Test)
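
The getBytesAndCheckEncoding and initDicReader calls above are private helpers of TestDictionary that are not part of this excerpt. As a rough sketch based on the public ValuesWriter API (an assumption, not the verbatim helper), getBytesAndCheckEncoding presumably copies the page bytes, asserts which encoding the writer actually produced, and resets the writer so the next writes start a fresh page:

private BytesInput getBytesAndCheckEncoding(ValuesWriter cw, Encoding encoding) throws IOException {
    // copy the bytes because the writer's internal buffers are reused after reset()
    BytesInput bytes = BytesInput.copy(cw.getBytes());
    assertEquals(encoding, cw.getEncoding());
    cw.reset();
    return bytes;
}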

Example 17 with BytesInput

Use of org.apache.parquet.bytes.BytesInput in the apache/parquet-mr project.

From the class TestDictionary, method testSecondPageFallBack.

@Test
public void testSecondPageFallBack() throws IOException {
    int COUNT = 1000;
    // small dictionary budget, so a page full of distinct values will not fit and forces a fallback
    ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(1000, 10000);
    writeRepeated(COUNT, cw, "a");
    BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    writeDistinct(COUNT, cw, "b");
    // not efficient so falls back
    BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN);
    writeRepeated(COUNT, cw, "a");
    // still plain because we fell back on previous page
    BytesInput bytes3 = getBytesAndCheckEncoding(cw, PLAIN);
    ValuesReader cr = initDicReader(cw, BINARY);
    checkRepeated(COUNT, bytes1, cr, "a");
    // pages 2 and 3 were plain-encoded, so they need a plain reader instead of the dictionary reader
    cr = new BinaryPlainValuesReader();
    checkDistinct(COUNT, bytes2, cr, "b");
    checkRepeated(COUNT, bytes3, cr, "a");
}
Also used: ValuesReader (org.apache.parquet.column.values.ValuesReader), PlainValuesReader (org.apache.parquet.column.values.plain.PlainValuesReader), BinaryPlainValuesReader (org.apache.parquet.column.values.plain.BinaryPlainValuesReader), BytesInput (org.apache.parquet.bytes.BytesInput), PlainValuesWriter (org.apache.parquet.column.values.plain.PlainValuesWriter), PlainIntegerDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter), PlainFloatDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter), ValuesWriter (org.apache.parquet.column.values.ValuesWriter), PlainBinaryDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter), PlainLongDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter), FallbackValuesWriter (org.apache.parquet.column.values.fallback.FallbackValuesWriter), PlainDoubleDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter), Test (org.junit.Test)
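
The writeRepeated, writeDistinct, checkRepeated and checkDistinct calls are again TestDictionary helpers not shown here. A hypothetical sketch of the two writers (the value patterns are assumptions; only Binary.fromString and ValuesWriter.writeBytes are taken from the real API) shows why the second page falls back: writeRepeated keeps reusing a few values, while writeDistinct emits a new value on every call, which makes the dictionary unprofitable:

// Hypothetical helper shapes, not copied from TestDictionary:
private void writeRepeated(int count, ValuesWriter cw, String prefix) {
    for (int i = 0; i < count; i++) {
        cw.writeBytes(Binary.fromString(prefix + i % 10)); // only a handful of distinct values
    }
}

private void writeDistinct(int count, ValuesWriter cw, String prefix) {
    for (int i = 0; i < count; i++) {
        cw.writeBytes(Binary.fromString(prefix + i)); // every value is new
    }
}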

Example 18 with BytesInput

Use of org.apache.parquet.bytes.BytesInput in the apache/parquet-mr project.

From the class TestDirectCodecFactory, method test.

private void test(int size, CompressionCodecName codec, boolean useOnHeapCompression, Decompression decomp) {
    ByteBuffer rawBuf = null;
    ByteBuffer outBuf = null;
    ByteBufferAllocator allocator = null;
    try {
        allocator = new DirectByteBufferAllocator();
        final CodecFactory codecFactory = CodecFactory.createDirectCodecFactory(new Configuration(), allocator, pageSize);
        rawBuf = allocator.allocate(size);
        final byte[] rawArr = new byte[size];
        outBuf = allocator.allocate(size * 2);
        final Random r = new Random();
        final byte[] random = new byte[1024];
        int pos = 0;
        while (pos < size) {
            r.nextBytes(random);
            rawBuf.put(random);
            System.arraycopy(random, 0, rawArr, pos, random.length);
            pos += random.length;
        }
        rawBuf.flip();
        final DirectCodecFactory.BytesCompressor c = codecFactory.getCompressor(codec);
        final CodecFactory.BytesDecompressor d = codecFactory.getDecompressor(codec);
        final BytesInput compressed;
        if (useOnHeapCompression) {
            compressed = c.compress(BytesInput.from(rawArr));
        } else {
            compressed = c.compress(BytesInput.from(rawBuf));
        }
        switch(decomp) {
            case OFF_HEAP:
                {
                    final ByteBuffer buf = compressed.toByteBuffer();
                    final ByteBuffer b = allocator.allocate(buf.capacity());
                    try {
                        b.put(buf);
                        b.flip();
                        d.decompress(b, (int) compressed.size(), outBuf, size);
                        for (int i = 0; i < size; i++) {
                            Assert.assertTrue("Data didn't match at " + i, outBuf.get(i) == rawBuf.get(i));
                        }
                    } finally {
                        allocator.release(b);
                    }
                    break;
                }
            case OFF_HEAP_BYTES_INPUT:
                {
                    final ByteBuffer buf = compressed.toByteBuffer();
                    final ByteBuffer b = allocator.allocate(buf.limit());
                    try {
                        b.put(buf);
                        b.flip();
                        final BytesInput input = d.decompress(BytesInput.from(b), size);
                        Assert.assertArrayEquals(String.format("While testing codec %s", codec), input.toByteArray(), rawArr);
                    } finally {
                        allocator.release(b);
                    }
                    break;
                }
            case ON_HEAP:
                {
                    final byte[] buf = compressed.toByteArray();
                    final BytesInput input = d.decompress(BytesInput.from(buf), size);
                    Assert.assertArrayEquals(input.toByteArray(), rawArr);
                    break;
                }
        }
    } catch (Exception e) {
        final String msg = String.format("Failure while testing Codec: %s, OnHeapCompressionInput: %s, Decompression Mode: %s, Data Size: %d", codec.name(), useOnHeapCompression, decomp.name(), size);
        System.out.println(msg);
        throw new RuntimeException(msg, e);
    } finally {
        if (rawBuf != null) {
            allocator.release(rawBuf);
        }
        if (outBuf != null) {
            allocator.release(outBuf);
        }
    }
}
Also used: DirectByteBufferAllocator (org.apache.parquet.bytes.DirectByteBufferAllocator), Configuration (org.apache.hadoop.conf.Configuration), BytesInput (org.apache.parquet.bytes.BytesInput), HeapByteBufferAllocator (org.apache.parquet.bytes.HeapByteBufferAllocator), ByteBufferAllocator (org.apache.parquet.bytes.ByteBufferAllocator), ByteBuffer (java.nio.ByteBuffer), Random (java.util.Random)
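
Outside the test harness, the same factory API round-trips a BytesInput in a few calls. A minimal on-heap sketch (the SNAPPY codec and the 64 KiB page size are illustrative assumptions, not values from the test):

// inside a method that declares throws IOException
Configuration conf = new Configuration();
ByteBufferAllocator allocator = new DirectByteBufferAllocator();
CodecFactory factory = CodecFactory.createDirectCodecFactory(conf, allocator, 64 * 1024);
BytesInput raw = BytesInput.from(new byte[] { 1, 2, 3, 4, 5 });
long rawSize = raw.size();
// compress, then decompress back to the original uncompressed size
BytesInput compressed = factory.getCompressor(CompressionCodecName.SNAPPY).compress(raw);
BytesInput restored = factory.getDecompressor(CompressionCodecName.SNAPPY).decompress(compressed, (int) rawSize);
byte[] roundTripped = restored.toByteArray();
factory.release();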

Example 19 with BytesInput

Use of org.apache.parquet.bytes.BytesInput in the apache/parquet-mr project.

From the class ColumnWriterV2, method writePage.

/**
 * writes the current data to a new page in the page store
 * @param rowCount how many rows have been written so far
 */
public void writePage(long rowCount) {
    int pageRowCount = Ints.checkedCast(rowCount - rowsWrittenSoFar);
    this.rowsWrittenSoFar = rowCount;
    if (DEBUG)
        LOG.debug("write page");
    try {
        // TODO: rework this API. Those must be called *in that order*
        BytesInput bytes = dataColumn.getBytes();
        Encoding encoding = dataColumn.getEncoding();
        pageWriter.writePageV2(
            pageRowCount,
            Ints.checkedCast(statistics.getNumNulls()),
            valueCount,
            path.getMaxRepetitionLevel() == 0 ? BytesInput.empty() : repetitionLevelColumn.toBytes(),
            path.getMaxDefinitionLevel() == 0 ? BytesInput.empty() : definitionLevelColumn.toBytes(),
            encoding,
            bytes,
            statistics);
    } catch (IOException e) {
        throw new ParquetEncodingException("could not write page for " + path, e);
    }
    repetitionLevelColumn.reset();
    definitionLevelColumn.reset();
    dataColumn.reset();
    valueCount = 0;
    resetStatistics();
}
Also used: BytesInput (org.apache.parquet.bytes.BytesInput), ParquetEncodingException (org.apache.parquet.io.ParquetEncodingException), Encoding (org.apache.parquet.column.Encoding), IOException (java.io.IOException)
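
The two BytesInput.empty() branches are what a column with max repetition or definition level 0 contributes: an input of size zero. A small illustrative sketch of the BytesInput arithmetic (the byte values are made up; the real V2 page writer also records each part's byte length in the page header rather than relying on concatenation alone):

BytesInput repetitionLevels = BytesInput.empty();                              // max repetition level == 0
BytesInput definitionLevels = BytesInput.from(new byte[] { 2, 0, 0, 0, 3 });   // made-up RLE bytes
BytesInput data = BytesInput.from(new byte[] { 42, 7 });                       // made-up page data
BytesInput page = BytesInput.concat(repetitionLevels, definitionLevels, data);
// empty() contributes nothing to the total size
assert page.size() == definitionLevels.size() + data.size();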

Aggregations

BytesInput (org.apache.parquet.bytes.BytesInput): 19
Test (org.junit.Test): 13
PlainValuesWriter (org.apache.parquet.column.values.plain.PlainValuesWriter): 8
ValuesWriter (org.apache.parquet.column.values.ValuesWriter): 5
PlainDoubleDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter): 5
PlainFloatDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter): 5
PlainIntegerDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter): 5
PlainLongDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter): 5
PlainBinaryDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter): 4
FallbackValuesWriter (org.apache.parquet.column.values.fallback.FallbackValuesWriter): 4
BinaryPlainValuesReader (org.apache.parquet.column.values.plain.BinaryPlainValuesReader): 4
IOException (java.io.IOException): 3
ByteBufferInputStream (org.apache.parquet.bytes.ByteBufferInputStream): 3
HeapByteBufferAllocator (org.apache.parquet.bytes.HeapByteBufferAllocator): 3
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 3
PageWriter (org.apache.parquet.column.page.PageWriter): 3
ValuesReader (org.apache.parquet.column.values.ValuesReader): 3
Encoding (org.apache.parquet.column.Encoding): 2
BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics): 2
PlainValuesReader (org.apache.parquet.column.values.plain.PlainValuesReader): 2