Search in sources :

Example 1 with ParquetDictionary

Use of org.apache.flink.formats.parquet.vector.ParquetDictionary in the Apache Flink project.

The method readToVector of the class AbstractColumnReader.

/**
 * Reads {@code readNumber} values from this column reader into {@code vector}, starting at row 0
 * of the vector and fetching a new page from the page reader whenever the current page has been
 * fully consumed.
 *
 * <p>For a dictionary-encoded page the dictionary ids are read first. They are then either kept
 * as-is with the dictionary attached to the vector (lazy decoding) or decoded immediately into
 * the vector. When a non-dictionary page follows dictionary-encoded rows within the same call,
 * the rows written so far are decoded first, because a vector cannot mix both representations.
 *
 * @param readNumber number of values to read
 * @param vector the vector that receives the values
 * @throws IOException if values are still requested but the page reader has no further page;
 *     the page-reading helpers called here presumably raise it on read failures as well
 */
@Override
public final void readToVector(int readNumber, VECTOR vector) throws IOException {
    int rowId = 0;
    WritableIntVector dictionaryIds = null;
    if (dictionary != null) {
        dictionaryIds = vector.reserveDictionaryIds(readNumber);
    }
    while (readNumber > 0) {
        // Compute the number of values we want to read in this page.
        int leftInPage = (int) (endOfPageValueCount - valuesRead);
        if (leftInPage == 0) {
            DataPage page = pageReader.readPage();
            if (page == null) {
                // Parquet's PageReader#readPage documents a null return after the last page
                // of the chunk. Without this check the null would fall through both
                // instanceof tests and fail with a bare NullPointerException in the
                // "unsupported page type" branch below.
                throw new IOException(
                        "No more pages to read from the column chunk, but "
                                + readNumber
                                + " value(s) are still requested.");
            }
            if (page instanceof DataPageV1) {
                readPageV1((DataPageV1) page);
            } else if (page instanceof DataPageV2) {
                readPageV2((DataPageV2) page);
            } else {
                throw new RuntimeException("Unsupported page type: " + page.getClass());
            }
            // The page-reading helpers are expected to advance endOfPageValueCount, so the
            // remaining count is recomputed for the freshly loaded page.
            leftInPage = (int) (endOfPageValueCount - valuesRead);
        }
        int num = Math.min(readNumber, leftInPage);
        if (isCurrentPageDictionaryEncoded) {
            // Read and decode dictionary ids.
            runLenDecoder.readDictionaryIds(num, dictionaryIds, vector, rowId, maxDefLevel, this.dictionaryIdsDecoder);
            if (vector.hasDictionary() || (rowId == 0 && supportLazyDecode())) {
                // Column vector supports lazy decoding of dictionary values so just set the
                // dictionary. We can't do this if rowId != 0 AND the column doesn't have a
                // dictionary (i.e. some non-dictionary encoded values have already been added).
                vector.setDictionary(new ParquetDictionary(dictionary));
            } else {
                readBatchFromDictionaryIds(rowId, num, vector, dictionaryIds);
            }
        } else {
            if (vector.hasDictionary() && rowId != 0) {
                // This batch already has dictionary encoded values but this new page is not.
                // The batch does not support a mix of dictionary and not, so we decode the
                // rows [0, rowId) that are still held as dictionary ids.
                readBatchFromDictionaryIds(0, rowId, vector, vector.getDictionaryIds());
            }
            vector.setDictionary(null);
            readBatch(rowId, num, vector);
        }
        valuesRead += num;
        rowId += num;
        readNumber -= num;
    }
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) DataPageV2(org.apache.parquet.column.page.DataPageV2) DataPageV1(org.apache.parquet.column.page.DataPageV1) ParquetDictionary(org.apache.flink.formats.parquet.vector.ParquetDictionary) WritableIntVector(org.apache.flink.table.data.columnar.vector.writable.WritableIntVector)

Aggregations

ParquetDictionary (org.apache.flink.formats.parquet.vector.ParquetDictionary)1 WritableIntVector (org.apache.flink.table.data.columnar.vector.writable.WritableIntVector)1 DataPage (org.apache.parquet.column.page.DataPage)1 DataPageV1 (org.apache.parquet.column.page.DataPageV1)1 DataPageV2 (org.apache.parquet.column.page.DataPageV2)1