use of org.apache.parquet.column.values.ValuesReader in project parquet-mr by apache.
the class TestBitPackingColumn method validateEncodeDecode.
/**
 * Writes {@code vals} with every {@code PACKING_TYPE}, checks the encoded bytes against
 * {@code expected}, then reads the values back and checks they match the input.
 *
 * @param bitLength bit width used to derive the bound handed to the writer and the reader
 * @param vals values to encode and decode
 * @param expected expected {@code TestBitPacking.toString} rendering of the encoded bytes
 * @throws IOException if the writer or the reader fails
 */
private void validateEncodeDecode(int bitLength, int[] vals, String expected) throws IOException {
  // The bound depends only on bitLength, so compute it once for all packing types.
  // Note: the (int) cast applies to Math.pow(...) before the subtraction.
  final int bound = (int) Math.pow(2, bitLength) - 1;
  for (PACKING_TYPE type : PACKING_TYPE.values()) {
    LOG.debug("{}", type);

    // Encode.
    ValuesWriter w = type.getWriter(bound);
    for (int i : vals) {
      w.writeInteger(i);
    }
    byte[] bytes = w.getBytes().toByteArray();
    LOG.debug("vals ({}): {}", bitLength, TestBitPacking.toString(vals));
    LOG.debug("bytes: {}", TestBitPacking.toString(bytes));
    assertEquals(type.toString(), expected, TestBitPacking.toString(bytes));

    // Decode the bytes that were just produced and compare with the input.
    ValuesReader r = type.getReader(bound);
    r.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes)));
    int[] result = new int[vals.length];
    for (int i = 0; i < result.length; i++) {
      result[i] = r.readInteger();
    }
    LOG.debug("result: {}", TestBitPacking.toString(result));
    assertArrayEquals(type + " result: " + TestBitPacking.toString(result), vals, result);
  }
}
use of org.apache.parquet.column.values.ValuesReader in project parquet-mr by apache.
the class TestDictionary method testFirstPageFallBack.
/**
 * Writes a page of distinct values followed by a page of repeated values and expects
 * both pages to be reported as PLAIN, then reads both back with a plain reader.
 */
@Test
public void testFirstPageFallBack() throws IOException {
  final int valueCount = 1000;
  final ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(10000, 10000);

  // First page: distinct values make the dictionary inefficient, so the writer falls back.
  writeDistinct(valueCount, writer, "a");
  final BytesInput firstPage = getBytesAndCheckEncoding(writer, PLAIN);

  // Second page: repeated values, but the fallback on the first page is kept.
  writeRepeated(valueCount, writer, "b");
  final BytesInput secondPage = getBytesAndCheckEncoding(writer, PLAIN);

  // Both pages are plain, so one plain reader verifies them in turn.
  final ValuesReader plainReader = new BinaryPlainValuesReader();
  checkDistinct(valueCount, firstPage, plainReader, "a");
  checkRepeated(valueCount, secondPage, plainReader, "b");
}
use of org.apache.parquet.column.values.ValuesReader in project parquet-mr by apache.
the class TestDictionary method testBinaryDictionaryFallBack.
/**
 * Writes 100 distinct strings into a dictionary writer whose maximum dictionary size is
 * 50 bytes and checks that the reported encoding switches from PLAIN_DICTIONARY to PLAIN
 * once the tracked data size reaches that limit. The values are then read back with a
 * plain reader and the writer is reset.
 */
@Test
public void testBinaryDictionaryFallBack() throws IOException {
  final int maxDictionaryByteSize = 50;
  final int slabSize = 100;
  final ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(maxDictionaryByteSize, slabSize);

  int writtenBytes = 0;
  for (long n = 0; n < 100; n++) {
    final Binary value = Binary.fromString("str" + n);
    writer.writeBytes(value);
    // Each value is accounted as its length plus 4 bytes
    // (presumably a length prefix -- TODO confirm against the writer).
    writtenBytes += (value.length() + 4);
    // Dictionary encoding is expected only while the tracked size is below the limit.
    assertEquals(writtenBytes < maxDictionaryByteSize ? PLAIN_DICTIONARY : PLAIN, writer.getEncoding());
  }

  // The writer fell back to plain encoding, so a plain reader is used to read it back.
  final ValuesReader plainReader = new BinaryPlainValuesReader();
  plainReader.initFromPage(100, writer.getBytes().toInputStream());
  for (long n = 0; n < 100; n++) {
    assertEquals(Binary.fromString("str" + n), plainReader.readBytes());
  }

  // Simulate cutting the page: after a reset nothing should remain buffered.
  writer.reset();
  assertEquals(0, writer.getBufferedSize());
}
use of org.apache.parquet.column.values.ValuesReader in project parquet-mr by apache.
the class TestDictionary method testSecondPageFallBack.
/**
 * Writes three pages (repeated, distinct, repeated) and expects the first to be
 * PLAIN_DICTIONARY and the other two PLAIN; the first page is then verified with a
 * dictionary reader and the remaining pages with a plain reader.
 */
@Test
public void testSecondPageFallBack() throws IOException {
  final int valueCount = 1000;
  final ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(1000, 10000);

  // Page 1: repeated values stay dictionary encoded.
  writeRepeated(valueCount, writer, "a");
  final BytesInput dictionaryPage = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);

  // Page 2: distinct values are not efficient for the dictionary, so the writer falls back.
  writeDistinct(valueCount, writer, "b");
  final BytesInput distinctPlainPage = getBytesAndCheckEncoding(writer, PLAIN);

  // Page 3: still plain because the fallback happened on the previous page.
  writeRepeated(valueCount, writer, "a");
  final BytesInput repeatedPlainPage = getBytesAndCheckEncoding(writer, PLAIN);

  final ValuesReader dictionaryReader = initDicReader(writer, BINARY);
  checkRepeated(valueCount, dictionaryPage, dictionaryReader, "a");

  final ValuesReader plainReader = new BinaryPlainValuesReader();
  checkDistinct(valueCount, distinctPlainPage, plainReader, "b");
  checkRepeated(valueCount, repeatedPlainPage, plainReader, "a");
}
use of org.apache.parquet.column.values.ValuesReader in project parquet-mr by apache.
the class ColumnReaderImpl method initDataReader.
/**
 * Prepares this reader for a new data page: records the page's encoding and value count,
 * creates the matching values reader, binds the converter, and initializes the reader
 * from the page stream.
 *
 * @param dataEncoding encoding of the page's values
 * @param in stream positioned at the page's value data
 * @param valueCount number of values in the page
 * @throws ParquetDecodingException if the encoding needs a dictionary that is missing, or
 *     if initializing the reader from the page fails with an {@link IOException}
 */
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
  // Remember the reader of the previous page before it is replaced below.
  final ValuesReader priorReader = this.dataColumn;

  this.currentEncoding = dataEncoding;
  this.pageValueCount = valueCount;
  this.endOfPageValueCount = readValues + pageValueCount;

  final boolean dictionaryEncoded = dataEncoding.usesDictionary();
  if (!dictionaryEncoded) {
    this.dataColumn = dataEncoding.getValuesReader(path, VALUES);
  } else if (dictionary != null) {
    this.dataColumn = dataEncoding.getDictionaryBasedValuesReader(path, VALUES, dictionary);
  } else {
    throw new ParquetDecodingException("could not read page in col " + path + " as the dictionary was missing for encoding " + dataEncoding);
  }

  // Bind to the dictionary only when the page is dictionary encoded and the converter supports it.
  if (dictionaryEncoded && converter.hasDictionarySupport()) {
    bindToDictionary(dictionary);
  } else {
    bind(path.getType());
  }

  try {
    dataColumn.initFromPage(pageValueCount, in);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page in col " + path, e);
  }

  // The previous reader can only be set if reading sequentially.
  // instanceof is false for null, so no separate null check is needed.
  // NOTE(review): the type test is on the previous reader while the cast is applied to the
  // new reader; this mirrors the existing behavior -- confirm it is intentional.
  final boolean sequentialReadsRequired = CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding);
  if (sequentialReadsRequired && priorReader instanceof RequiresPreviousReader) {
    ((RequiresPreviousReader) dataColumn).setPreviousReader(priorReader);
  }
}
Aggregations