use of org.apache.parquet.column.values.dictionary.DictionaryValuesReader in project drill by apache.
the class PageReader method next.
/**
* Grab the next page.
*
* @return - if another page was present
* @throws IOException
*/
public boolean next() throws IOException {
Stopwatch timer = Stopwatch.createUnstarted();
currentPageCount = -1;
valuesRead = 0;
valuesReadyToRead = 0;
// TODO - the metatdata for total size appears to be incorrect for impala generated files, need to find cause
// and submit a bug report
long totalValueCount = parentColumnReader.columnChunkMetaData.getValueCount();
if (parentColumnReader.totalValuesRead >= totalValueCount) {
return false;
}
clearBuffers();
nextInternal();
if (pageData == null || pageHeader == null) {
//TODO: Is this an error condition or a normal condition??
return false;
}
timer.start();
currentPageCount = pageHeader.data_page_header.num_values;
final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);
byteLength = pageHeader.uncompressed_page_size;
final ByteBuffer pageDataBuffer = pageData.nioBuffer(0, pageData.capacity());
readPosInBytes = 0;
if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
repetitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
// we know that the first value will be a 0, at the end of each list of repeated values we will hit another 0 indicating
// a new record, although we don't know the length until we hit it (and this is a one way stream of integers) so we
// read the first zero here to simplify the reading processes, and start reading the first value the same as all
// of the rest. Effectively we are 'reading' the non-existent value in front of the first allowing direct access to
// the first list of repetition levels
readPosInBytes = repetitionLevels.getNextOffset();
repetitionLevels.readInteger();
}
if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) {
parentColumnReader.currDefLevel = -1;
definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
definitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
readPosInBytes = definitionLevels.getNextOffset();
if (!valueEncoding.usesDictionary()) {
valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
}
}
if (parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
}
if (valueEncoding.usesDictionary()) {
// initialize two of the dictionary readers, one is for determining the lengths of each value, the second is for
// actually copying the values out into the vectors
dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
dictionaryLengthDeterminingReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
dictionaryValueReader = new DictionaryValuesReader(dictionary);
dictionaryValueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
parentColumnReader.usingDictionary = true;
} else {
parentColumnReader.usingDictionary = false;
}
// readPosInBytes is used for actually reading the values after we determine how many will fit in the vector
// readyToReadPosInBytes serves a similar purpose for the vector types where we must count up the values that will
// fit one record at a time, such as for variable length data. Both operations must start in the same location after the
// definition and repetition level data which is stored alongside the page data itself
readyToReadPosInBytes = readPosInBytes;
long timeDecode = timer.elapsed(TimeUnit.NANOSECONDS);
stats.numDataPagesDecoded.incrementAndGet();
stats.timeDataPageDecode.addAndGet(timeDecode);
return true;
}
use of org.apache.parquet.column.values.dictionary.DictionaryValuesReader in project drill by axbaretto.
the class PageReader method next.
/**
* Grab the next page.
*
* @return - if another page was present
* @throws IOException
*/
public boolean next() throws IOException {
Stopwatch timer = Stopwatch.createUnstarted();
currentPageCount = -1;
valuesRead = 0;
valuesReadyToRead = 0;
// TODO - the metatdata for total size appears to be incorrect for impala generated files, need to find cause
// and submit a bug report
long totalValueCount = parentColumnReader.columnChunkMetaData.getValueCount();
if (parentColumnReader.totalValuesRead >= totalValueCount) {
return false;
}
clearBuffers();
nextInternal();
if (pageData == null || pageHeader == null) {
// TODO: Is this an error condition or a normal condition??
return false;
}
timer.start();
currentPageCount = pageHeader.data_page_header.num_values;
final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);
byteLength = pageHeader.uncompressed_page_size;
final ByteBuffer pageDataBuffer = pageData.nioBuffer(0, pageData.capacity());
readPosInBytes = 0;
if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
repetitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
// we know that the first value will be a 0, at the end of each list of repeated values we will hit another 0 indicating
// a new record, although we don't know the length until we hit it (and this is a one way stream of integers) so we
// read the first zero here to simplify the reading processes, and start reading the first value the same as all
// of the rest. Effectively we are 'reading' the non-existent value in front of the first allowing direct access to
// the first list of repetition levels
readPosInBytes = repetitionLevels.getNextOffset();
repetitionLevels.readInteger();
}
if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) {
parentColumnReader.currDefLevel = -1;
definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
definitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
readPosInBytes = definitionLevels.getNextOffset();
if (!valueEncoding.usesDictionary()) {
valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
}
}
if (parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
}
if (valueEncoding.usesDictionary()) {
// initialize two of the dictionary readers, one is for determining the lengths of each value, the second is for
// actually copying the values out into the vectors
dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
dictionaryLengthDeterminingReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
dictionaryValueReader = new DictionaryValuesReader(dictionary);
dictionaryValueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
parentColumnReader.usingDictionary = true;
} else {
parentColumnReader.usingDictionary = false;
}
// readPosInBytes is used for actually reading the values after we determine how many will fit in the vector
// readyToReadPosInBytes serves a similar purpose for the vector types where we must count up the values that will
// fit one record at a time, such as for variable length data. Both operations must start in the same location after the
// definition and repetition level data which is stored alongside the page data itself
readyToReadPosInBytes = readPosInBytes;
long timeDecode = timer.elapsed(TimeUnit.NANOSECONDS);
stats.numDataPagesDecoded.incrementAndGet();
stats.timeDataPageDecode.addAndGet(timeDecode);
return true;
}
Aggregations