Search in sources :

Example 1 with DictionaryPage

use of io.trino.parquet.DictionaryPage in project trino by trinodb.

the class ParquetColumnChunk method readAllPages.

/**
 * Reads every page of this column chunk and bundles the result into a {@link PageReader}.
 *
 * <p>Page headers are consumed one at a time for as long as {@code hasMorePages} reports more input.
 * A dictionary page is kept aside (at most one is accepted), data pages of either format are handed
 * to the matching {@code readDataPageV1}/{@code readDataPageV2} helper together with the shared page
 * list, and pages of any other type are skipped over.
 *
 * @return a reader built from the chunk's codec, the collected data pages, the dictionary page
 *         ({@code null} when none was seen), the offset index and the accumulated value count
 * @throws ParquetCorruptionException if a second dictionary page is encountered in the chunk
 * @throws IOException as declared; propagated from the underlying reads
 */
public PageReader readAllPages() throws IOException {
    LinkedList<DataPage> dataPages = new LinkedList<>();
    DictionaryPage dictionary = null;
    long totalValues = 0;
    int dataPagesSeen = 0;
    while (hasMorePages(totalValues, dataPagesSeen)) {
        PageHeader header = readPageHeader();
        int uncompressedSize = header.getUncompressed_page_size();
        int compressedSize = header.getCompressed_page_size();
        switch (header.type) {
            case DICTIONARY_PAGE: {
                // A column chunk may carry at most one dictionary page.
                if (dictionary != null) {
                    throw new ParquetCorruptionException("%s has more than one dictionary page in column chunk", descriptor.getColumnDescriptor());
                }
                dictionary = readDictionaryPage(header, uncompressedSize, compressedSize);
                break;
            }
            case DATA_PAGE: {
                // The first-row lookup is keyed by the number of data pages seen so far.
                OptionalLong firstRow = PageReader.getFirstRowIndex(dataPagesSeen, offsetIndex);
                totalValues += readDataPageV1(header, uncompressedSize, compressedSize, dataPages, firstRow);
                dataPagesSeen++;
                break;
            }
            case DATA_PAGE_V2: {
                OptionalLong firstRow = PageReader.getFirstRowIndex(dataPagesSeen, offsetIndex);
                totalValues += readDataPageV2(header, uncompressedSize, compressedSize, dataPages, firstRow);
                dataPagesSeen++;
                break;
            }
            default: {
                // Any other page type: step over its compressed payload.
                // NOTE(review): the value returned by skip() is not checked — confirm that a full skip is guaranteed here.
                input.skip(compressedSize);
                advanceIfNecessary();
                break;
            }
        }
    }
    return new PageReader(descriptor.getColumnChunkMetaData().getCodec(), dataPages, dictionary, offsetIndex, totalValues);
}
Also used : ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) DataPage(io.trino.parquet.DataPage) DictionaryPageHeader(org.apache.parquet.format.DictionaryPageHeader) DataPageHeader(org.apache.parquet.format.DataPageHeader) PageHeader(org.apache.parquet.format.PageHeader) OptionalLong(java.util.OptionalLong) LinkedList(java.util.LinkedList) DictionaryPage(io.trino.parquet.DictionaryPage)

Example 2 with DictionaryPage

use of io.trino.parquet.DictionaryPage in project trino by trinodb.

the class PredicateUtils method readDictionaryPage.

/**
 * Attempts to decode a dictionary page located at the start of {@code data}.
 *
 * @param data bytes that begin with a page header
 * @param codecName codec handed to {@code decompress} for the page payload
 * @return the decoded dictionary page, or an empty {@link Optional} when the first page is not a
 *         dictionary page or an {@link IOException} is raised while reading it
 */
private static Optional<DictionaryPage> readDictionaryPage(Slice data, CompressionCodecName codecName) {
    try {
        SliceInput input = data.getInput();
        PageHeader header = Util.readPageHeader(input);
        if (header.type == PageType.DICTIONARY_PAGE) {
            // The payload directly follows the header and spans the compressed page size.
            Slice payload = input.readSlice(header.getCompressed_page_size());
            DictionaryPageHeader dictionaryHeader = header.getDictionary_page_header();
            // Map the header's encoding onto the Encoding enum by constant name, then convert it.
            String encodingName = dictionaryHeader.getEncoding().name();
            ParquetEncoding encoding = getParquetEncoding(Encoding.valueOf(encodingName));
            int entryCount = dictionaryHeader.getNum_values();
            return Optional.of(new DictionaryPage(decompress(codecName, payload, header.getUncompressed_page_size()), entryCount, encoding));
        }
        return Optional.empty();
    } catch (IOException ignored) {
        // Best effort: a read failure is reported as "no dictionary page" rather than propagated.
        return Optional.empty();
    }
}
Also used : PageHeader(org.apache.parquet.format.PageHeader) DictionaryPageHeader(org.apache.parquet.format.DictionaryPageHeader) Slice(io.airlift.slice.Slice) IOException(java.io.IOException) SliceInput(io.airlift.slice.SliceInput) DictionaryPage(io.trino.parquet.DictionaryPage) ParquetTypeUtils.getParquetEncoding(io.trino.parquet.ParquetTypeUtils.getParquetEncoding) ParquetEncoding(io.trino.parquet.ParquetEncoding)

Example 3 with DictionaryPage

use of io.trino.parquet.DictionaryPage in project trino by trinodb.

the class TupleDomainParquetPredicate method getDomain.

/**
 * Builds a {@link Domain} for a column from the entries of its dictionary page.
 *
 * <p>Falls back to {@code Domain.all(type)} when there is no dictionary descriptor, when the
 * descriptor carries no dictionary page, or when the dictionary cannot be initialized.
 *
 * @param type type the resulting domain is built for
 * @param dictionaryDescriptor source of the column descriptor and optional dictionary page; may be {@code null}
 * @param timeZone forwarded unchanged to the value-based {@code getDomain} overload
 * @return the domain produced by the value-based overload, or {@code Domain.all(type)} in the fallback cases
 */
private static Domain getDomain(Type type, DictionaryDescriptor dictionaryDescriptor, DateTimeZone timeZone) {
    if (dictionaryDescriptor == null) {
        return Domain.all(type);
    }

    ColumnDescriptor column = dictionaryDescriptor.getColumnDescriptor();
    Optional<DictionaryPage> maybePage = dictionaryDescriptor.getDictionaryPage();
    if (!maybePage.isPresent()) {
        return Domain.all(type);
    }
    DictionaryPage page = maybePage.get();

    Dictionary decoded;
    try {
        decoded = page.getEncoding().initDictionary(column, page);
    } catch (Exception e) {
        // Failing to read a dictionary is tolerated; fall back to the unrestricted domain.
        return Domain.all(type);
    }

    int entryCount = page.getDictionarySize();
    Function<Integer, Object> toValue = new DictionaryValueConverter(decoded).getConverter(column.getPrimitiveType());
    List<Object> dictionaryValues = new ArrayList<>();
    for (int id = 0; id < entryCount; id++) {
        dictionaryValues.add(toValue.apply(id));
    }
    // TODO: when min == max (i.e., singleton ranges), the construction of Domains can be done more efficiently
    // The same list is passed for both value-list arguments of the overload.
    return getDomain(column, type, dictionaryValues, dictionaryValues, true, timeZone);
}
Also used : Dictionary(io.trino.parquet.dictionary.Dictionary) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) RichColumnDescriptor(io.trino.parquet.RichColumnDescriptor) ArrayList(java.util.ArrayList) DictionaryPage(io.trino.parquet.DictionaryPage) ParquetCorruptionException(io.trino.parquet.ParquetCorruptionException) VerifyException(com.google.common.base.VerifyException)

Example 4 with DictionaryPage

use of io.trino.parquet.DictionaryPage in project trino by trinodb.

the class PrimitiveColumnReader method setPageReader.

/**
 * Installs a new page reader and resets the per-chunk reading state.
 *
 * <p>The dictionary is initialized from the reader's dictionary page when one exists and cleared
 * otherwise. When {@code rowRanges} is supplied, the row iterator is positioned on its first row.
 *
 * @param pageReader reader for the column chunk; must not be {@code null} and must report a positive total value count
 * @param rowRanges optional row filter; when non-null it must contain at least one row
 * @throws ParquetDecodingException if initializing the dictionary raises an {@link IOException}
 * @throws IllegalArgumentException (via {@code checkArgument}) if the reader has no values or {@code rowRanges} is empty
 */
public void setPageReader(PageReader pageReader, RowRanges rowRanges) {
    this.pageReader = requireNonNull(pageReader, "pageReader");

    DictionaryPage page = pageReader.readDictionaryPage();
    if (page == null) {
        dictionary = null;
    } else {
        try {
            dictionary = page.getEncoding().initDictionary(columnDescriptor, page);
        } catch (IOException e) {
            throw new ParquetDecodingException("could not decode the dictionary for " + columnDescriptor, e);
        }
    }

    checkArgument(pageReader.getTotalValueCount() > 0, "page is empty");
    totalValueCount = pageReader.getTotalValueCount();

    if (rowRanges == null) {
        return;
    }
    indexIterator = rowRanges.iterator();
    // An empty rowRanges means no page of the row group needs reading, so this method should not have been called.
    checkArgument(indexIterator.hasNext(), "rowRanges is empty");
    targetRow = indexIterator.next();
}
Also used : ParquetDecodingException(org.apache.parquet.io.ParquetDecodingException) IOException(java.io.IOException) DictionaryPage(io.trino.parquet.DictionaryPage)

Aggregations

DictionaryPage (io.trino.parquet.DictionaryPage)4 ParquetCorruptionException (io.trino.parquet.ParquetCorruptionException)2 IOException (java.io.IOException)2 DictionaryPageHeader (org.apache.parquet.format.DictionaryPageHeader)2 PageHeader (org.apache.parquet.format.PageHeader)2 VerifyException (com.google.common.base.VerifyException)1 Slice (io.airlift.slice.Slice)1 SliceInput (io.airlift.slice.SliceInput)1 DataPage (io.trino.parquet.DataPage)1 ParquetEncoding (io.trino.parquet.ParquetEncoding)1 ParquetTypeUtils.getParquetEncoding (io.trino.parquet.ParquetTypeUtils.getParquetEncoding)1 RichColumnDescriptor (io.trino.parquet.RichColumnDescriptor)1 Dictionary (io.trino.parquet.dictionary.Dictionary)1 ArrayList (java.util.ArrayList)1 LinkedList (java.util.LinkedList)1 OptionalLong (java.util.OptionalLong)1 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)1 DataPageHeader (org.apache.parquet.format.DataPageHeader)1 ParquetDecodingException (org.apache.parquet.io.ParquetDecodingException)1