Use of org.apache.parquet.bytes.ByteBufferInputStream in project presto by prestodb.
Class Decoders, method createDefinitionLevelDecoder.
public static final DefinitionLevelDecoder createDefinitionLevelDecoder(ParquetEncoding encoding, int maxLevelValue, int valueCount, ByteBuffer buffer)
        throws IOException
{
    final int bitWidth = getWidthFromMaxInt(maxLevelValue);
    if (maxLevelValue == 0 || bitWidth == 0) {
        // No levels to decode: every value is defined at this nesting depth.
        return new DefinitionLevelDecoder(0, valueCount);
    }
    checkArgument(encoding == RLE, "Invalid definition level encoding: " + encoding);
    ByteBufferInputStream bufferInputStream = ByteBufferInputStream.wrap(buffer);
    // The RLE section is prefixed by its length as a 4-byte little-endian int.
    final int bufferSize = readIntLittleEndian(bufferInputStream);
    DefinitionLevelDecoder definitionLevelDecoder = new DefinitionLevelDecoder(valueCount, bitWidth, bufferInputStream.sliceStream(bufferSize));
    // wrap() duplicated the buffer, so advance the caller's position manually.
    buffer.position(buffer.position() + bufferSize + 4);
    return definitionLevelDecoder;
}
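For reference, a minimal self-contained sketch of the prefix-then-slice pattern used above (the class name and byte values are made up for illustration). ByteBufferInputStream.wrap duplicates the buffer, so reads through the stream never move the original ByteBuffer's position; that is why the method above advances it manually by bufferSize + 4.

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.BytesUtils;

public class SliceStreamDemo {
    public static void main(String[] args) throws IOException {
        // Mimic an RLE level section: 4-byte little-endian length prefix, then payload.
        byte[] payload = {0x10, 0x02}; // arbitrary bytes standing in for RLE data
        ByteBuffer buffer = ByteBuffer.allocate(4 + payload.length).order(ByteOrder.LITTLE_ENDIAN);
        buffer.putInt(payload.length);
        buffer.put(payload);
        buffer.flip();

        ByteBufferInputStream in = ByteBufferInputStream.wrap(buffer);
        int sectionLength = BytesUtils.readIntLittleEndian(in);       // consumes the 4-byte prefix
        ByteBufferInputStream levels = in.sliceStream(sectionLength); // advances the parent stream
        System.out.println(sectionLength);  // 2
        System.out.println(levels.read());  // 16 (0x10), first byte of the slice
        System.out.println(in.position());  // 6, while buffer.position() is still 0
    }
}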
Use of org.apache.parquet.bytes.ByteBufferInputStream in project presto by prestodb.
Class Decoders, method createValuesDecoder.
private static final ValuesDecoder createValuesDecoder(ColumnDescriptor columnDescriptor, Dictionary dictionary, int valueCount, ParquetEncoding encoding, byte[] buffer, int offset, int length)
        throws IOException
{
    final PrimitiveTypeName type = columnDescriptor.getPrimitiveType().getPrimitiveTypeName();
    if (encoding == PLAIN) {
        switch (type) {
            case BOOLEAN:
                return new BooleanPlainValuesDecoder(buffer, offset, length);
            case INT32:
            case FLOAT:
                // INT32 and FLOAT are both fixed 4-byte values in PLAIN encoding.
                return new Int32PlainValuesDecoder(buffer, offset, length);
            case INT64: {
                if (isTimeStampMicrosType(columnDescriptor)) {
                    return new Int64TimestampMicrosPlainValuesDecoder(buffer, offset, length);
                }
            }
            // fall through: INT64 and DOUBLE are both fixed 8-byte values
            case DOUBLE:
                return new Int64PlainValuesDecoder(buffer, offset, length);
            case INT96:
                return new TimestampPlainValuesDecoder(buffer, offset, length);
            case BINARY:
                return new BinaryPlainValuesDecoder(buffer, offset, length);
            case FIXED_LEN_BYTE_ARRAY:
            default:
                throw new PrestoException(PARQUET_UNSUPPORTED_COLUMN_TYPE, format("Column: %s, Encoding: %s", columnDescriptor, encoding));
        }
    }
    if (encoding == RLE && type == BOOLEAN) {
        ByteBuffer byteBuffer = ByteBuffer.wrap(buffer, offset, length);
        // skip past the 4-byte length prefix
        byteBuffer.getInt();
        return new BooleanRLEValuesDecoder(byteBuffer);
    }
    if (encoding == RLE_DICTIONARY || encoding == PLAIN_DICTIONARY) {
        InputStream inputStream = ByteBufferInputStream.wrap(ByteBuffer.wrap(buffer, offset, length));
        // The page body starts with one byte holding the bit width of the dictionary indices.
        int bitWidth = readIntLittleEndianOnOneByte(inputStream);
        switch (type) {
            case INT32:
            case FLOAT: {
                return new Int32RLEDictionaryValuesDecoder(bitWidth, inputStream, (IntegerDictionary) dictionary);
            }
            case INT64: {
                if (isTimeStampMicrosType(columnDescriptor)) {
                    return new Int64TimestampMicrosRLEDictionaryValuesDecoder(bitWidth, inputStream, (LongDictionary) dictionary);
                }
            }
            // fall through
            case DOUBLE: {
                return new Int64RLEDictionaryValuesDecoder(bitWidth, inputStream, (LongDictionary) dictionary);
            }
            case INT96: {
                return new TimestampRLEDictionaryValuesDecoder(bitWidth, inputStream, (TimestampDictionary) dictionary);
            }
            case BINARY: {
                return new BinaryRLEDictionaryValuesDecoder(bitWidth, inputStream, (BinaryBatchDictionary) dictionary);
            }
            case FIXED_LEN_BYTE_ARRAY:
            default:
                throw new PrestoException(PARQUET_UNSUPPORTED_COLUMN_TYPE, format("Column: %s, Encoding: %s", columnDescriptor, encoding));
        }
    }
    if (encoding == DELTA_BINARY_PACKED) {
        ByteBufferInputStream inputStream = ByteBufferInputStream.wrap(ByteBuffer.wrap(buffer, offset, length));
        switch (type) {
            case INT32:
            case FLOAT: {
                return new Int32DeltaBinaryPackedValuesDecoder(valueCount, inputStream);
            }
            case INT64: {
                if (isTimeStampMicrosType(columnDescriptor)) {
                    return new Int64TimestampMicrosDeltaBinaryPackedValuesDecoder(valueCount, inputStream);
                }
            }
            // fall through
            case DOUBLE: {
                return new Int64DeltaBinaryPackedValuesDecoder(valueCount, inputStream);
            }
            default:
                throw new PrestoException(PARQUET_UNSUPPORTED_COLUMN_TYPE, format("Column: %s, Encoding: %s", columnDescriptor, encoding));
        }
    }
    if ((encoding == DELTA_BYTE_ARRAY || encoding == DELTA_LENGTH_BYTE_ARRAY) && type == PrimitiveTypeName.BINARY) {
        ByteBufferInputStream inputStream = ByteBufferInputStream.wrap(ByteBuffer.wrap(buffer, offset, length));
        return new BinaryDeltaValuesDecoder(encoding, valueCount, inputStream);
    }
    throw new PrestoException(PARQUET_UNSUPPORTED_ENCODING, format("Column: %s, Encoding: %s", columnDescriptor, encoding));
}
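The dictionary branch above reads a single leading byte that holds the bit width of the dictionary indices. A minimal sketch of that header read, with made-up page bytes:

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.BytesUtils;

public class DictionaryHeaderDemo {
    public static void main(String[] args) throws IOException {
        // A dictionary-encoded data page body begins with one byte: the bit
        // width of the dictionary indices that follow.
        byte[] pageBody = {9, 0x00, 0x00}; // trailing bytes stand in for index data
        InputStream in = ByteBufferInputStream.wrap(ByteBuffer.wrap(pageBody, 0, pageBody.length));
        int bitWidth = BytesUtils.readIntLittleEndianOnOneByte(in);
        System.out.println(bitWidth); // 9
    }
}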
Use of org.apache.parquet.bytes.ByteBufferInputStream in project presto by prestodb.
Class AbstractColumnReader, method readPageV1.
private ValuesReader readPageV1(DataPageV1 page)
{
    ValuesReader repetitionLevelReader = page.getRepetitionLevelEncoding().getValuesReader(columnDescriptor, REPETITION_LEVEL);
    ValuesReader definitionLevelReader = page.getDefinitionLevelEncoding().getValuesReader(columnDescriptor, DEFINITION_LEVEL);
    repetitionReader = new LevelValuesReader(repetitionLevelReader);
    definitionReader = new LevelValuesReader(definitionLevelReader);
    try {
        // Repetition levels, definition levels, and values sit back to back in a
        // v1 page; each initFromPage call advances the shared stream past its section.
        ByteBufferInputStream bufferInputStream = ByteBufferInputStream.wrap(page.getSlice().toByteBuffer());
        repetitionLevelReader.initFromPage(page.getValueCount(), bufferInputStream);
        definitionLevelReader.initFromPage(page.getValueCount(), bufferInputStream);
        long firstRowIndex = page.getFirstRowIndex().orElse(-1L);
        return initDataReader(page.getValueEncoding(), bufferInputStream, page.getValueCount(), firstRowIndex);
    }
    catch (IOException e) {
        throw new ParquetDecodingException("Error reading parquet page " + page + " in column " + columnDescriptor, e);
    }
}
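Each level reader's initFromPage reads its own 4-byte length prefix and slices just that section from the shared stream. A sketch with a hand-built RLE run (bit width 1, one run of eight 1s; the class name and bytes are illustrative only):

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesReader;

public class LevelReaderDemo {
    public static void main(String[] args) throws IOException {
        // 4-byte little-endian length (2), then one RLE run:
        // header 0x10 = run of 8, value byte 0x01, at bit width 1.
        ByteBuffer section = ByteBuffer.allocate(6).order(ByteOrder.LITTLE_ENDIAN);
        section.putInt(2);
        section.put((byte) 0x10).put((byte) 0x01);
        section.flip();

        ByteBufferInputStream in = ByteBufferInputStream.wrap(section);
        RunLengthBitPackingHybridValuesReader levels = new RunLengthBitPackingHybridValuesReader(1);
        levels.initFromPage(8, in);               // consumes the length prefix and the RLE section
        System.out.println(levels.readInteger()); // 1 (first of eight 1s)
        System.out.println(in.position());        // 6: the stream is past the level section
    }
}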
Use of org.apache.parquet.bytes.ByteBufferInputStream in project flink by apache.
Class AbstractColumnReader, method readPageV1.
private void readPageV1(DataPageV1 page) throws IOException {
    this.pageValueCount = page.getValueCount();
    ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);

    // Initialize the decoders.
    if (page.getDlEncoding() != Encoding.RLE && descriptor.getMaxDefinitionLevel() != 0) {
        throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding());
    }
    int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
    this.runLenDecoder = new RunLengthDecoder(bitWidth);
    try {
        BytesInput bytes = page.getBytes();
        ByteBufferInputStream in = bytes.toInputStream();
        rlReader.initFromPage(pageValueCount, in);
        this.runLenDecoder.initFromStream(pageValueCount, in);
        prepareNewPage(page.getValueEncoding(), in);
    } catch (IOException e) {
        throw new IOException("could not read page " + page + " in col " + descriptor, e);
    }
}
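Here page.getBytes() returns a BytesInput, and BytesInput.toInputStream() yields a ByteBufferInputStream, so the level decoder and the value decoder can share one positioned stream. A minimal sketch of that conversion (demo class name is invented):

import java.io.IOException;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.BytesInput;

public class BytesInputDemo {
    public static void main(String[] args) throws IOException {
        BytesInput bytes = BytesInput.from(new byte[] {1, 2, 3, 4});
        ByteBufferInputStream in = bytes.toInputStream();
        in.skip(2);                        // e.g. past bytes a level reader consumed
        System.out.println(in.position()); // 2
        System.out.println(in.read());     // 3
    }
}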
Use of org.apache.parquet.bytes.ByteBufferInputStream in project hive by apache.
Class BaseVectorizedColumnReader, method readPageV1.
private void readPageV1(DataPageV1 page) {
    ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);
    ValuesReader dlReader = page.getDlEncoding().getValuesReader(descriptor, DEFINITION_LEVEL);
    this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader);
    this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader);
    try {
        BytesInput bytes = page.getBytes();
        LOG.debug("page size " + bytes.size() + " bytes and " + pageValueCount + " records");
        ByteBufferInputStream in = bytes.toInputStream();
        LOG.debug("reading repetition levels at " + in.position());
        rlReader.initFromPage(pageValueCount, in);
        LOG.debug("reading definition levels at " + in.position());
        dlReader.initFromPage(pageValueCount, in);
        LOG.debug("reading data at " + in.position());
        initDataReader(page.getValueEncoding(), in, page.getValueCount());
    } catch (IOException e) {
        throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
    }
}
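The debug lines above work because ByteBufferInputStream tracks an absolute position across reads. A small sketch of the same bookkeeping, using sliceStream as a stand-in for the sections each reader would consume (section sizes are arbitrary):

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.parquet.bytes.ByteBufferInputStream;

public class PositionDemo {
    public static void main(String[] args) throws IOException {
        ByteBufferInputStream in = ByteBufferInputStream.wrap(ByteBuffer.wrap(new byte[16]));
        System.out.println("repetition levels at " + in.position()); // 0
        in.sliceStream(6); // stand-in for the repetition-level section
        System.out.println("definition levels at " + in.position()); // 6
        in.sliceStream(6); // stand-in for the definition-level section
        System.out.println("data at " + in.position());              // 12
    }
}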