Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by Apache.
The class DictionaryValuesWriter, method getBytes.
/**
 * Encodes the buffered dictionary ids into the bytes of a data page.
 *
 * <p>The result is a single header byte holding the bit width, followed by the
 * output of a {@link RunLengthBitPackingHybridEncoder} fed with every id in
 * {@code encodedValues}. The bit width is derived from the largest possible id
 * (dictionary size minus one).
 *
 * <p>As a side effect, the encoder is added to {@code encoders}, and the current
 * dictionary size and byte size are stored in {@code lastUsedDictionarySize} and
 * {@code lastUsedDictionaryByteSize}.
 *
 * @return the bit-width header concatenated with the encoded ids
 * @throws ParquetEncodingException if the encoder raises an {@link IOException}
 */
@Override
public BytesInput getBytes() {
  final int highestId = getDictionarySize() - 1;
  LOG.debug("max dic id {}", highestId);
  final int idBitWidth = BytesUtils.getWidthFromMaxInt(highestId);
  final int slabSize = CapacityByteArrayOutputStream.initialSlabSizeHeuristic(
      MIN_INITIAL_SLAB_SIZE, maxDictionaryByteSize, 10);
  final RunLengthBitPackingHybridEncoder rleEncoder = new RunLengthBitPackingHybridEncoder(
      idBitWidth, slabSize, maxDictionaryByteSize, this.allocator);
  // NOTE(review): presumably tracked so the encoder can be released later — confirm against the rest of the class.
  encoders.add(rleEncoder);
  try {
    for (IntIterator ids = encodedValues.iterator(); ids.hasNext(); ) {
      rleEncoder.writeInt(ids.next());
    }
    final BytesInput rlePayload = rleEncoder.toBytes();
    LOG.debug("rle encoded bytes {}", rlePayload.size());
    // The page starts with one byte that carries the bit width.
    final BytesInput widthHeader = BytesInput.from(new byte[] { (byte) idBitWidth });
    final BytesInput page = concat(widthHeader, rlePayload);
    // Remember how large the dictionary was when this page was produced.
    lastUsedDictionarySize = getDictionarySize();
    lastUsedDictionaryByteSize = dictionaryByteSize;
    return page;
  } catch (IOException e) {
    throw new ParquetEncodingException("could not encode the values", e);
  }
}
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by Apache.
The class TestDictionary, method testBinaryDictionary.
/**
 * Writes two pages of repeated binary values that must be dictionary encoded,
 * then a page of distinct values that must be reported as PLAIN, and finally
 * reads all three pages back and checks their contents.
 */
@Test
public void testBinaryDictionary() throws IOException {
  final int valueCount = 100;
  final ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(200, 10000);

  // Two pages of repeated values: both are expected to be dictionary encoded.
  writeRepeated(valueCount, writer, "a");
  final BytesInput pageA = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);
  writeRepeated(valueCount, writer, "b");
  final BytesInput pageB = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);

  // Distinct values: this page is expected to come back as PLAIN (fallback).
  writeDistinct(valueCount, writer, "c");
  final BytesInput pageC = getBytesAndCheckEncoding(writer, PLAIN);

  // Read the dictionary pages back through a dictionary reader.
  final DictionaryValuesReader dictionaryReader = initDicReader(writer, BINARY);
  checkRepeated(valueCount, pageA, dictionaryReader, "a");
  checkRepeated(valueCount, pageB, dictionaryReader, "b");

  // The fallback page is read with a plain binary reader.
  checkDistinct(valueCount, pageC, new BinaryPlainValuesReader(), "c");
}
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by Apache.
The class TestDictionary, method testFloatDictionary.
/**
 * Writes two pages of floats drawn from 50 distinct values, checks that both
 * pages are dictionary encoded and that the dictionary holds exactly 50 entries,
 * then reads both pages back and verifies every value.
 *
 * <p>Loop counters are integers; the written and expected float values are
 * derived from them. This avoids driving the loops with floating-point
 * variables while producing the same values (0..49 are exactly representable
 * as float).
 */
@Test
public void testFloatDictionary() throws IOException {
  final int COUNT = 2000;
  final int COUNT2 = 4000;
  final FallbackValuesWriter<PlainFloatDictionaryValuesWriter, PlainValuesWriter> cw = newPlainFloatDictionaryValuesWriter(10000, 10000);

  // First page: ascending counter, values cycle through 0..49.
  for (int i = 0; i < COUNT; i++) {
    cw.writeFloat((float) (i % 50));
  }
  BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
  assertEquals(50, cw.initialWriter.getDictionarySize());

  // Second page: descending counter, same 50 distinct values, so the dictionary must not grow.
  for (int i = COUNT2; i > 0; i--) {
    cw.writeFloat((float) (i % 50));
  }
  BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
  assertEquals(50, cw.initialWriter.getDictionarySize());

  DictionaryValuesReader cr = initDicReader(cw, FLOAT);

  cr.initFromPage(COUNT, bytes1.toInputStream());
  for (int i = 0; i < COUNT; i++) {
    float expected = (float) (i % 50);
    float back = cr.readFloat();
    assertEquals(expected, back, 0.0f);
  }

  cr.initFromPage(COUNT2, bytes2.toInputStream());
  for (int i = COUNT2; i > 0; i--) {
    float expected = (float) (i % 50);
    float back = cr.readFloat();
    assertEquals(expected, back, 0.0f);
  }
}
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by Apache.
The class TestDictionary, method testBinaryDictionaryChangedValues.
/**
 * Same scenario as the plain binary dictionary test, but the repeated values
 * are written through {@code writeRepeatedWithReuse}: two dictionary-encoded
 * pages, then a page of distinct values reported as PLAIN, all read back and
 * verified.
 */
@Test
public void testBinaryDictionaryChangedValues() throws IOException {
  final int valueCount = 100;
  final ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(200, 10000);

  // Two pages of repeated values: both are expected to be dictionary encoded.
  writeRepeatedWithReuse(valueCount, writer, "a");
  final BytesInput pageA = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);
  writeRepeatedWithReuse(valueCount, writer, "b");
  final BytesInput pageB = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);

  // Distinct values: this page is expected to come back as PLAIN (fallback).
  writeDistinct(valueCount, writer, "c");
  final BytesInput pageC = getBytesAndCheckEncoding(writer, PLAIN);

  // Read the dictionary pages back through a dictionary reader.
  final DictionaryValuesReader dictionaryReader = initDicReader(writer, BINARY);
  checkRepeated(valueCount, pageA, dictionaryReader, "a");
  checkRepeated(valueCount, pageB, dictionaryReader, "b");

  // The fallback page is read with a plain binary reader.
  checkDistinct(valueCount, pageC, new BinaryPlainValuesReader(), "c");
}
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by Apache.
The class TestDictionary, method testIntDictionary.
/**
 * Writes two pages of integers drawn from 50 distinct values, checks that both
 * pages are dictionary encoded and that the dictionary holds exactly 50 entries,
 * then reads both pages back and verifies every value.
 */
@Test
public void testIntDictionary() throws IOException {
  final int firstPageCount = 2000;
  final int secondPageCount = 4000;
  final FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> writer =
      newPlainIntegerDictionaryValuesWriter(10000, 10000);

  // First page: ascending counter, values cycle through 0..49.
  for (int n = 0; n < firstPageCount; n++) {
    writer.writeInteger(n % 50);
  }
  final BytesInput firstPage = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);
  assertEquals(50, writer.initialWriter.getDictionarySize());

  // Second page: descending counter from secondPageCount down to 1.
  int remaining = secondPageCount;
  while (remaining > 0) {
    writer.writeInteger(remaining % 50);
    remaining--;
  }
  final BytesInput secondPage = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);
  assertEquals(50, writer.initialWriter.getDictionarySize());

  final DictionaryValuesReader reader = initDicReader(writer, INT32);

  // Verify the first page in write order.
  reader.initFromPage(firstPageCount, firstPage.toInputStream());
  for (int n = 0; n < firstPageCount; n++) {
    final int actual = reader.readInteger();
    assertEquals(n % 50, actual);
  }

  // Verify the second page in write order (descending counter).
  reader.initFromPage(secondPageCount, secondPage.toInputStream());
  int expectedSource = secondPageCount;
  while (expectedSource > 0) {
    final int actual = reader.readInteger();
    assertEquals(expectedSource % 50, actual);
    expectedSource--;
  }
}
Aggregations