Search in sources :

Example 1 with DictionaryValuesReader

use of org.apache.parquet.column.values.dictionary.DictionaryValuesReader in project drill by apache.

the class PageReader method next.

/**
 * Moves this reader on to the next data page of the column chunk and sets up the
 * repetition-level, definition-level and value readers over that page's bytes.
 *
 * @return {@code true} when a page was loaded and its readers were initialized; {@code false}
 *         when the column chunk's value count has already been reached, or when no page data
 *         or page header is available after attempting to load one
 * @throws IOException if loading the next page fails (presumably raised by {@code nextInternal()})
 */
public boolean next() throws IOException {
    // Reset the per-page bookkeeping before looking for another page.
    currentPageCount = -1;
    valuesRead = 0;
    valuesReadyToRead = 0;

    // TODO - the metadata for total size appears to be incorrect for impala generated files; the cause still
    // needs to be found and a bug report submitted
    final long chunkValueCount = parentColumnReader.columnChunkMetaData.getValueCount();
    if (chunkValueCount <= parentColumnReader.totalValuesRead) {
        return false;
    }

    clearBuffers();
    nextInternal();
    if (pageHeader == null || pageData == null) {
        // TODO: Is this an error condition or a normal condition??
        return false;
    }

    // Only the decoding work from here on is timed.
    final Stopwatch decodeTimer = Stopwatch.createUnstarted().start();
    currentPageCount = pageHeader.data_page_header.num_values;
    final Encoding repetitionEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
    final Encoding definitionEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
    final Encoding dataEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);
    // Evaluated once and reused by the branches below.
    final boolean dictionaryEncoded = dataEncoding.usesDictionary();
    byteLength = pageHeader.uncompressed_page_size;
    final ByteBuffer pageBytes = pageData.nioBuffer(0, pageData.capacity());
    readPosInBytes = 0;

    if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
        repetitionLevels = repetitionEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
        repetitionLevels.initFromPage(currentPageCount, pageBytes, (int) readPosInBytes);
        // The first repetition level is known to be 0, and every list of repeated values is terminated by the
        // next 0, which marks a new record. The length of a list is unknown until that 0 is reached, and the
        // levels are a one-way stream of integers, so the leading 0 is consumed here. That lets the first list
        // be read exactly like every later one, as if a value preceding the first had already been read.
        readPosInBytes = repetitionLevels.getNextOffset();
        repetitionLevels.readInteger();
    }

    if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) {
        parentColumnReader.currDefLevel = -1;
        definitionLevels = definitionEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
        definitionLevels.initFromPage(currentPageCount, pageBytes, (int) readPosInBytes);
        readPosInBytes = definitionLevels.getNextOffset();
        if (!dictionaryEncoded) {
            valueReader = dataEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
            valueReader.initFromPage(currentPageCount, pageBytes, (int) readPosInBytes);
        }
    }

    // Boolean columns always get a value reader, whatever the definition level handling above did.
    if (parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
        valueReader = dataEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
        valueReader.initFromPage(currentPageCount, pageBytes, (int) readPosInBytes);
    }

    if (!dictionaryEncoded) {
        parentColumnReader.usingDictionary = false;
    } else {
        // Two dictionary readers are set up over the same page position: one is used to determine the length
        // of each value, the other to actually copy the values out into the vectors.
        dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
        dictionaryLengthDeterminingReader.initFromPage(currentPageCount, pageBytes, (int) readPosInBytes);
        dictionaryValueReader = new DictionaryValuesReader(dictionary);
        dictionaryValueReader.initFromPage(currentPageCount, pageBytes, (int) readPosInBytes);
        parentColumnReader.usingDictionary = true;
    }

    // readPosInBytes drives the actual reading of values once it is known how many fit in the vector.
    // readyToReadPosInBytes plays the same role for vector types where the values that fit must be counted one
    // record at a time, such as variable length data. Both must start at the same location: just past the
    // definition and repetition level data that is stored alongside the page data itself.
    readyToReadPosInBytes = readPosInBytes;

    final long elapsedNanos = decodeTimer.elapsed(TimeUnit.NANOSECONDS);
    stats.numDataPagesDecoded.incrementAndGet();
    stats.timeDataPageDecode.addAndGet(elapsedNanos);
    return true;
}
Also used : Stopwatch(com.google.common.base.Stopwatch) Encoding(org.apache.parquet.column.Encoding) DictionaryValuesReader(org.apache.parquet.column.values.dictionary.DictionaryValuesReader) ByteBuffer(java.nio.ByteBuffer)

Example 2 with DictionaryValuesReader

use of org.apache.parquet.column.values.dictionary.DictionaryValuesReader in project drill by axbaretto.

the class PageReader method next.

/**
 * Moves this reader on to the next data page of the column chunk and sets up the
 * repetition-level, definition-level and value readers over that page's bytes.
 *
 * @return {@code true} when a page was loaded and its readers were initialized; {@code false}
 *         when the column chunk's value count has already been reached, or when no page data
 *         or page header is available after attempting to load one
 * @throws IOException if loading the next page fails (presumably raised by {@code nextInternal()})
 */
public boolean next() throws IOException {
    // Reset the per-page bookkeeping before looking for another page.
    currentPageCount = -1;
    valuesRead = 0;
    valuesReadyToRead = 0;

    // TODO - the metadata for total size appears to be incorrect for impala generated files; the cause still
    // needs to be found and a bug report submitted
    final long chunkValueCount = parentColumnReader.columnChunkMetaData.getValueCount();
    if (chunkValueCount <= parentColumnReader.totalValuesRead) {
        return false;
    }

    clearBuffers();
    nextInternal();
    if (pageHeader == null || pageData == null) {
        // TODO: Is this an error condition or a normal condition??
        return false;
    }

    // Only the decoding work from here on is timed.
    final Stopwatch decodeTimer = Stopwatch.createUnstarted().start();
    currentPageCount = pageHeader.data_page_header.num_values;
    final Encoding repetitionEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
    final Encoding definitionEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
    final Encoding dataEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);
    // Evaluated once and reused by the branches below.
    final boolean dictionaryEncoded = dataEncoding.usesDictionary();
    byteLength = pageHeader.uncompressed_page_size;
    final ByteBuffer pageBytes = pageData.nioBuffer(0, pageData.capacity());
    readPosInBytes = 0;

    if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
        repetitionLevels = repetitionEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
        repetitionLevels.initFromPage(currentPageCount, pageBytes, (int) readPosInBytes);
        // The first repetition level is known to be 0, and every list of repeated values is terminated by the
        // next 0, which marks a new record. The length of a list is unknown until that 0 is reached, and the
        // levels are a one-way stream of integers, so the leading 0 is consumed here. That lets the first list
        // be read exactly like every later one, as if a value preceding the first had already been read.
        readPosInBytes = repetitionLevels.getNextOffset();
        repetitionLevels.readInteger();
    }

    if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) {
        parentColumnReader.currDefLevel = -1;
        definitionLevels = definitionEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
        definitionLevels.initFromPage(currentPageCount, pageBytes, (int) readPosInBytes);
        readPosInBytes = definitionLevels.getNextOffset();
        if (!dictionaryEncoded) {
            valueReader = dataEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
            valueReader.initFromPage(currentPageCount, pageBytes, (int) readPosInBytes);
        }
    }

    // Boolean columns always get a value reader, whatever the definition level handling above did.
    if (parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
        valueReader = dataEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
        valueReader.initFromPage(currentPageCount, pageBytes, (int) readPosInBytes);
    }

    if (!dictionaryEncoded) {
        parentColumnReader.usingDictionary = false;
    } else {
        // Two dictionary readers are set up over the same page position: one is used to determine the length
        // of each value, the other to actually copy the values out into the vectors.
        dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
        dictionaryLengthDeterminingReader.initFromPage(currentPageCount, pageBytes, (int) readPosInBytes);
        dictionaryValueReader = new DictionaryValuesReader(dictionary);
        dictionaryValueReader.initFromPage(currentPageCount, pageBytes, (int) readPosInBytes);
        parentColumnReader.usingDictionary = true;
    }

    // readPosInBytes drives the actual reading of values once it is known how many fit in the vector.
    // readyToReadPosInBytes plays the same role for vector types where the values that fit must be counted one
    // record at a time, such as variable length data. Both must start at the same location: just past the
    // definition and repetition level data that is stored alongside the page data itself.
    readyToReadPosInBytes = readPosInBytes;

    final long elapsedNanos = decodeTimer.elapsed(TimeUnit.NANOSECONDS);
    stats.numDataPagesDecoded.incrementAndGet();
    stats.timeDataPageDecode.addAndGet(elapsedNanos);
    return true;
}
Also used : Stopwatch(com.google.common.base.Stopwatch) Encoding(org.apache.parquet.column.Encoding) DictionaryValuesReader(org.apache.parquet.column.values.dictionary.DictionaryValuesReader) ByteBuffer(java.nio.ByteBuffer)

Aggregations

Stopwatch (com.google.common.base.Stopwatch)2 ByteBuffer (java.nio.ByteBuffer)2 Encoding (org.apache.parquet.column.Encoding)2 DictionaryValuesReader (org.apache.parquet.column.values.dictionary.DictionaryValuesReader)2