Search in sources:

Example 1 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

the class DictionaryValuesWriter method getBytes.

/**
 * Encodes every value currently held in {@code encodedValues} with a
 * {@link RunLengthBitPackingHybridEncoder} and returns the result prefixed by a
 * single header byte that carries the bit width used for the encoding.
 *
 * <p>The encoder created here is also appended to {@code encoders}, and the current
 * dictionary size / byte size are recorded in {@code lastUsedDictionarySize} and
 * {@code lastUsedDictionaryByteSize} before returning.
 *
 * @return the header byte followed by the encoded values
 * @throws ParquetEncodingException if the encoder raises an {@link IOException}
 */
@Override
public BytesInput getBytes() {
    final int largestId = getDictionarySize() - 1;
    LOG.debug("max dic id {}", largestId);
    // bit width is derived from the largest id, i.e. dictionary size minus one
    final int idBitWidth = BytesUtils.getWidthFromMaxInt(largestId);
    final int slabSize = CapacityByteArrayOutputStream.initialSlabSizeHeuristic(MIN_INITIAL_SLAB_SIZE, maxDictionaryByteSize, 10);
    final RunLengthBitPackingHybridEncoder rleEncoder = new RunLengthBitPackingHybridEncoder(idBitWidth, slabSize, maxDictionaryByteSize, this.allocator);
    encoders.add(rleEncoder);
    try {
        for (IntIterator ids = encodedValues.iterator(); ids.hasNext(); ) {
            rleEncoder.writeInt(ids.next());
        }
        // one-byte header that encodes the bit width
        final byte[] widthHeader = { (byte) idBitWidth };
        final BytesInput encodedPayload = rleEncoder.toBytes();
        LOG.debug("rle encoded bytes {}", encodedPayload.size());
        final BytesInput page = concat(BytesInput.from(widthHeader), encodedPayload);
        // remember the dictionary size as of the last page written
        lastUsedDictionarySize = getDictionarySize();
        lastUsedDictionaryByteSize = dictionaryByteSize;
        return page;
    } catch (IOException ioe) {
        throw new ParquetEncodingException("could not encode the values", ioe);
    }
}
Also used : IntIterator(org.apache.parquet.column.values.dictionary.IntList.IntIterator) BytesInput(org.apache.parquet.bytes.BytesInput) ParquetEncodingException(org.apache.parquet.io.ParquetEncodingException) IOException(java.io.IOException) RunLengthBitPackingHybridEncoder(org.apache.parquet.column.values.rle.RunLengthBitPackingHybridEncoder)

Example 2 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

the class TestDictionary method testBinaryDictionary.

/**
 * Writes two pages of repeated binary values followed by one page of distinct values
 * through the same writer, checking the encoding reported for each page
 * ({@code PLAIN_DICTIONARY}, {@code PLAIN_DICTIONARY}, then {@code PLAIN}), and
 * finally reads all three pages back.
 */
@Test
public void testBinaryDictionary() throws IOException {
    final int valuesPerPage = 100;
    ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(200, 10000);

    writeRepeated(valuesPerPage, writer, "a");
    BytesInput pageA = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);

    writeRepeated(valuesPerPage, writer, "b");
    BytesInput pageB = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);

    // distinct values from here on: this page is expected to fall back to PLAIN
    writeDistinct(valuesPerPage, writer, "c");
    BytesInput pageC = getBytesAndCheckEncoding(writer, PLAIN);

    // the two dictionary pages are read back through a dictionary reader
    DictionaryValuesReader dictionaryReader = initDicReader(writer, BINARY);
    checkRepeated(valuesPerPage, pageA, dictionaryReader, "a");
    checkRepeated(valuesPerPage, pageB, dictionaryReader, "b");

    // the fallback page is read back with a plain binary reader
    checkDistinct(valuesPerPage, pageC, new BinaryPlainValuesReader(), "c");
}
Also used : BinaryPlainValuesReader(org.apache.parquet.column.values.plain.BinaryPlainValuesReader) BytesInput(org.apache.parquet.bytes.BytesInput) PlainValuesWriter(org.apache.parquet.column.values.plain.PlainValuesWriter) PlainIntegerDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter) PlainFloatDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) PlainBinaryDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter) PlainLongDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter) FallbackValuesWriter(org.apache.parquet.column.values.fallback.FallbackValuesWriter) PlainDoubleDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter) Test(org.junit.Test)

Example 3 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

the class TestDictionary method testFloatDictionary.

/**
 * Writes two pages of floats that cycle through 50 distinct values (2000 ascending,
 * then 4000 descending), asserts both pages are reported as {@code PLAIN_DICTIONARY}
 * with a dictionary size of 50, and reads both pages back value by value.
 */
@Test
public void testFloatDictionary() throws IOException {
    final int firstCount = 2000;
    final int secondCount = 4000;
    final FallbackValuesWriter<PlainFloatDictionaryValuesWriter, PlainValuesWriter> writer = newPlainFloatDictionaryValuesWriter(10000, 10000);

    // first page: ascending counter, values wrap every 50
    for (float n = 0; n < firstCount; n++) {
        writer.writeFloat(n % 50);
    }
    BytesInput firstPage = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);
    assertEquals(50, writer.initialWriter.getDictionarySize());

    // second page: descending counter over the same 50 values
    for (float n = secondCount; n > 0; n--) {
        writer.writeFloat(n % 50);
    }
    BytesInput secondPage = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);
    assertEquals(50, writer.initialWriter.getDictionarySize());

    DictionaryValuesReader reader = initDicReader(writer, FLOAT);

    reader.initFromPage(firstCount, firstPage.toInputStream());
    for (float n = 0; n < firstCount; n++) {
        float expected = n % 50;
        float actual = reader.readFloat();
        assertEquals(expected, actual, 0.0f);
    }

    reader.initFromPage(secondCount, secondPage.toInputStream());
    for (float n = secondCount; n > 0; n--) {
        float expected = n % 50;
        float actual = reader.readFloat();
        assertEquals(expected, actual, 0.0f);
    }
}
Also used : PlainValuesWriter(org.apache.parquet.column.values.plain.PlainValuesWriter) BytesInput(org.apache.parquet.bytes.BytesInput) PlainFloatDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter) Test(org.junit.Test)

Example 4 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

the class TestDictionary method testBinaryDictionaryChangedValues.

/**
 * Same flow as a plain binary dictionary round trip, but the repeated values are
 * written through {@code writeRepeatedWithReuse}: two pages checked as
 * {@code PLAIN_DICTIONARY}, one page of distinct values checked as {@code PLAIN},
 * then all three pages are read back.
 */
@Test
public void testBinaryDictionaryChangedValues() throws IOException {
    final int valuesPerPage = 100;
    ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(200, 10000);

    writeRepeatedWithReuse(valuesPerPage, writer, "a");
    BytesInput pageA = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);

    writeRepeatedWithReuse(valuesPerPage, writer, "b");
    BytesInput pageB = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);

    // distinct values from here on: this page is expected to fall back to PLAIN
    writeDistinct(valuesPerPage, writer, "c");
    BytesInput pageC = getBytesAndCheckEncoding(writer, PLAIN);

    // the two dictionary pages are read back through a dictionary reader
    DictionaryValuesReader dictionaryReader = initDicReader(writer, BINARY);
    checkRepeated(valuesPerPage, pageA, dictionaryReader, "a");
    checkRepeated(valuesPerPage, pageB, dictionaryReader, "b");

    // the fallback page is read back with a plain binary reader
    checkDistinct(valuesPerPage, pageC, new BinaryPlainValuesReader(), "c");
}
Also used : BinaryPlainValuesReader(org.apache.parquet.column.values.plain.BinaryPlainValuesReader) BytesInput(org.apache.parquet.bytes.BytesInput) PlainValuesWriter(org.apache.parquet.column.values.plain.PlainValuesWriter) PlainIntegerDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter) PlainFloatDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) PlainBinaryDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter) PlainLongDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter) FallbackValuesWriter(org.apache.parquet.column.values.fallback.FallbackValuesWriter) PlainDoubleDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter) Test(org.junit.Test)

Example 5 with BytesInput

use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.

the class TestDictionary method testIntDictionary.

/**
 * Writes two pages of ints that cycle through 50 distinct values (2000 ascending,
 * then 4000 descending), asserts both pages are reported as {@code PLAIN_DICTIONARY}
 * with a dictionary size of 50, and reads both pages back value by value.
 */
@Test
public void testIntDictionary() throws IOException {
    final int firstCount = 2000;
    final int secondCount = 4000;
    final FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> writer = newPlainIntegerDictionaryValuesWriter(10000, 10000);

    // first page: ascending counter, values wrap every 50
    for (int n = 0; n < firstCount; n++) {
        writer.writeInteger(n % 50);
    }
    BytesInput firstPage = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);
    assertEquals(50, writer.initialWriter.getDictionarySize());

    // second page: descending counter over the same 50 values
    for (int n = secondCount; n > 0; n--) {
        writer.writeInteger(n % 50);
    }
    BytesInput secondPage = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);
    assertEquals(50, writer.initialWriter.getDictionarySize());

    DictionaryValuesReader reader = initDicReader(writer, INT32);

    reader.initFromPage(firstCount, firstPage.toInputStream());
    for (int n = 0; n < firstCount; n++) {
        int expected = n % 50;
        int actual = reader.readInteger();
        assertEquals(expected, actual);
    }

    reader.initFromPage(secondCount, secondPage.toInputStream());
    for (int n = secondCount; n > 0; n--) {
        int expected = n % 50;
        int actual = reader.readInteger();
        assertEquals(expected, actual);
    }
}
Also used : PlainValuesWriter(org.apache.parquet.column.values.plain.PlainValuesWriter) PlainIntegerDictionaryValuesWriter(org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter) BytesInput(org.apache.parquet.bytes.BytesInput) Test(org.junit.Test)

Aggregations

BytesInput (org.apache.parquet.bytes.BytesInput): 19; Test (org.junit.Test): 13; PlainValuesWriter (org.apache.parquet.column.values.plain.PlainValuesWriter): 8; ValuesWriter (org.apache.parquet.column.values.ValuesWriter): 5; PlainDoubleDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter): 5; PlainFloatDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter): 5; PlainIntegerDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter): 5; PlainLongDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter): 5; PlainBinaryDictionaryValuesWriter (org.apache.parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter): 4; FallbackValuesWriter (org.apache.parquet.column.values.fallback.FallbackValuesWriter): 4; BinaryPlainValuesReader (org.apache.parquet.column.values.plain.BinaryPlainValuesReader): 4; IOException (java.io.IOException): 3; ByteBufferInputStream (org.apache.parquet.bytes.ByteBufferInputStream): 3; HeapByteBufferAllocator (org.apache.parquet.bytes.HeapByteBufferAllocator): 3; ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 3; PageWriter (org.apache.parquet.column.page.PageWriter): 3; ValuesReader (org.apache.parquet.column.values.ValuesReader): 3; Encoding (org.apache.parquet.column.Encoding): 2; BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics): 2; PlainValuesReader (org.apache.parquet.column.values.plain.PlainValuesReader): 2