Search in sources :

Example 1 with ParquetDictionaryPage

use of com.facebook.presto.hive.parquet.ParquetDictionaryPage in project presto by prestodb.

the class TupleDomainParquetPredicate method getDomain.

/**
 * Derives the domain of values a column can contain from its Parquet dictionary page.
 *
 * <p>Every dictionary entry is decoded and turned into a single-value domain; a null-only
 * domain is always added before the union is taken.
 *
 * @param type the engine type the column is read as
 * @param dictionaryDescriptor the column descriptor paired with its optional dictionary page; may be {@code null}
 * @return the union of all dictionary values plus null, or {@code null} when the descriptor is
 *         {@code null}, no dictionary page is present, the dictionary cannot be initialized,
 *         or the combination of {@code type} and Parquet primitive type is not handled here
 */
@VisibleForTesting
public static Domain getDomain(Type type, ParquetDictionaryDescriptor dictionaryDescriptor) {
    if (dictionaryDescriptor == null) {
        return null;
    }

    ColumnDescriptor columnDescriptor = dictionaryDescriptor.getColumnDescriptor();
    Optional<ParquetDictionaryPage> dictionaryPage = dictionaryDescriptor.getDictionaryPage();
    if (!dictionaryPage.isPresent()) {
        return null;
    }
    ParquetDictionaryPage page = dictionaryPage.get();

    ParquetDictionary dictionary;
    try {
        dictionary = page.getEncoding().initDictionary(columnDescriptor, page);
    } catch (Exception e) {
        // A dictionary that cannot be read only means no domain can be derived from it.
        return null;
    }

    int entryCount = page.getDictionarySize();
    List<Domain> candidates = new ArrayList<>();
    if (type.equals(BIGINT) && columnDescriptor.getType() == PrimitiveTypeName.INT64) {
        for (int index = 0; index < entryCount; index++) {
            candidates.add(Domain.singleValue(type, dictionary.decodeToLong(index)));
        }
    } else if (type.equals(BIGINT) && columnDescriptor.getType() == PrimitiveTypeName.INT32) {
        // 32-bit dictionary entries are widened to long.
        for (int index = 0; index < entryCount; index++) {
            candidates.add(Domain.singleValue(type, (long) dictionary.decodeToInt(index)));
        }
    } else if (type.equals(DOUBLE) && columnDescriptor.getType() == PrimitiveTypeName.DOUBLE) {
        for (int index = 0; index < entryCount; index++) {
            candidates.add(Domain.singleValue(type, dictionary.decodeToDouble(index)));
        }
    } else if (type.equals(DOUBLE) && columnDescriptor.getType() == PrimitiveTypeName.FLOAT) {
        // Single-precision dictionary entries are widened to double.
        for (int index = 0; index < entryCount; index++) {
            candidates.add(Domain.singleValue(type, (double) dictionary.decodeToFloat(index)));
        }
    } else if (isVarcharType(type) && columnDescriptor.getType() == PrimitiveTypeName.BINARY) {
        for (int index = 0; index < entryCount; index++) {
            candidates.add(Domain.singleValue(type, Slices.wrappedBuffer(dictionary.decodeToBinary(index).getBytes())));
        }
    } else {
        // Unsupported type pairing: no domain can be computed.
        return null;
    }

    // The null-only domain is included in every supported case.
    candidates.add(Domain.onlyNull(type));
    return Domain.union(candidates);
}
Also used : ParquetDictionaryPage(com.facebook.presto.hive.parquet.ParquetDictionaryPage) ColumnDescriptor(parquet.column.ColumnDescriptor) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) ParquetDictionary(com.facebook.presto.hive.parquet.dictionary.ParquetDictionary) TupleDomain(com.facebook.presto.spi.predicate.TupleDomain) Domain(com.facebook.presto.spi.predicate.Domain) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 2 with ParquetDictionaryPage

use of com.facebook.presto.hive.parquet.ParquetDictionaryPage in project presto by prestodb.

the class ParquetColumnChunk method readAllPages.

/**
 * Reads pages from this column chunk until the number of values read reaches the value count
 * recorded in the column chunk metadata.
 *
 * <p>Data pages (V1 and V2) are collected and counted; at most one dictionary page is accepted;
 * pages of any other type are skipped by their compressed size.
 *
 * @return a page reader over the collected data pages and the dictionary page ({@code null} if none was seen)
 * @throws ParquetCorruptionException if a second dictionary page is encountered
 * @throws IOException declared by this method; raised by the underlying reads
 */
public ParquetPageReader readAllPages() throws IOException {
    ParquetDictionaryPage dictionary = null;
    List<ParquetDataPage> dataPages = new ArrayList<>();
    long valuesRead = 0;
    while (valuesRead < descriptor.getColumnChunkMetaData().getValueCount()) {
        PageHeader header = readPageHeader();
        int uncompressedSize = header.getUncompressed_page_size();
        int compressedSize = header.getCompressed_page_size();
        switch (header.type) {
            case DATA_PAGE:
                valuesRead += readDataPageV1(header, uncompressedSize, compressedSize, dataPages);
                break;
            case DATA_PAGE_V2:
                valuesRead += readDataPageV2(header, uncompressedSize, compressedSize, dataPages);
                break;
            case DICTIONARY_PAGE:
                // Only a single dictionary page is allowed per column chunk.
                if (dictionary != null) {
                    throw new ParquetCorruptionException("%s has more than one dictionary page in column chunk", descriptor.getColumnDescriptor());
                }
                dictionary = readDictionaryPage(header, uncompressedSize, compressedSize);
                break;
            default:
                // Any other page type is stepped over without being decoded.
                skip(compressedSize);
                break;
        }
    }
    return new ParquetPageReader(descriptor.getColumnChunkMetaData().getCodec(), dataPages, dictionary);
}
Also used : ParquetCorruptionException(com.facebook.presto.hive.parquet.ParquetCorruptionException) ParquetDictionaryPage(com.facebook.presto.hive.parquet.ParquetDictionaryPage) PageHeader(parquet.format.PageHeader) DictionaryPageHeader(parquet.format.DictionaryPageHeader) DataPageHeader(parquet.format.DataPageHeader) ParquetDataPage(com.facebook.presto.hive.parquet.ParquetDataPage) ArrayList(java.util.ArrayList)

Example 3 with ParquetDictionaryPage

use of com.facebook.presto.hive.parquet.ParquetDictionaryPage in project presto by prestodb.

the class ParquetColumnReader method setPageReader.

/**
 * Installs the page reader for this column and prepares the dictionary from it.
 *
 * <p>If the reader supplies a dictionary page, the dictionary is initialized from it; otherwise
 * the dictionary is cleared. The total value count is then taken from the reader.
 *
 * @param pageReader the reader to install; must not be {@code null} and must report a positive total value count
 * @throws ParquetDecodingException if initializing the dictionary fails with an {@link IOException}
 */
public void setPageReader(ParquetPageReader pageReader) {
    this.pageReader = requireNonNull(pageReader, "pageReader");

    ParquetDictionaryPage page = pageReader.readDictionaryPage();
    if (page == null) {
        // No dictionary page for this reader.
        dictionary = null;
    } else {
        try {
            dictionary = page.getEncoding().initDictionary(columnDescriptor, page);
        } catch (IOException cause) {
            throw new ParquetDecodingException("could not decode the dictionary for " + columnDescriptor, cause);
        }
    }

    checkArgument(pageReader.getTotalValueCount() > 0, "page is empty");
    totalValueCount = pageReader.getTotalValueCount();
}
Also used : ParquetDecodingException(parquet.io.ParquetDecodingException) ParquetDictionaryPage(com.facebook.presto.hive.parquet.ParquetDictionaryPage) IOException(java.io.IOException)

Example 4 with ParquetDictionaryPage

use of com.facebook.presto.hive.parquet.ParquetDictionaryPage in project presto by prestodb.

the class ParquetPredicateUtils method readDictionaryPage.

/**
 * Attempts to decode a dictionary page from the start of a raw column chunk buffer.
 *
 * @param data the bytes of the column chunk, beginning with a page header
 * @param codecName the compression codec used to decompress the page payload
 * @return the decoded dictionary page, or empty when the first page is not a dictionary page
 *         or an {@link IOException} occurs while reading or decompressing it
 */
private static Optional<ParquetDictionaryPage> readDictionaryPage(byte[] data, CompressionCodecName codecName) {
    try {
        ByteArrayInputStream headerStream = new ByteArrayInputStream(data);
        PageHeader header = Util.readPageHeader(headerStream);
        if (header.type == PageType.DICTIONARY_PAGE) {
            // Whatever the header reader consumed is the header; the payload starts right after it.
            int payloadOffset = data.length - headerStream.available();
            Slice compressedPayload = wrappedBuffer(data, payloadOffset, header.getCompressed_page_size());
            DictionaryPageHeader dictionaryHeader = header.getDictionary_page_header();
            ParquetEncoding encoding = getParquetEncoding(Encoding.valueOf(dictionaryHeader.getEncoding().name()));
            int entryCount = dictionaryHeader.getNum_values();
            return Optional.of(new ParquetDictionaryPage(
                    decompress(codecName, compressedPayload, header.getUncompressed_page_size()),
                    entryCount,
                    encoding));
        }
    } catch (IOException ignored) {
        // Best effort: an unreadable page is reported as "no dictionary page" below.
    }
    return Optional.empty();
}
Also used : ParquetDictionaryPage(com.facebook.presto.hive.parquet.ParquetDictionaryPage) ByteArrayInputStream(java.io.ByteArrayInputStream) PageHeader(parquet.format.PageHeader) DictionaryPageHeader(parquet.format.DictionaryPageHeader) Slice(io.airlift.slice.Slice) IOException(java.io.IOException) ParquetEncoding(com.facebook.presto.hive.parquet.ParquetEncoding) ParquetTypeUtils.getParquetEncoding(com.facebook.presto.hive.parquet.ParquetTypeUtils.getParquetEncoding)

Example 5 with ParquetDictionaryPage

use of com.facebook.presto.hive.parquet.ParquetDictionaryPage in project presto by prestodb.

the class ParquetPredicateUtils method getDictionariesByColumnOrdinal.

/**
 * Collects dictionary descriptors for the column chunks of a block, keyed by the chunk's
 * ordinal position within the block.
 *
 * <p>A chunk gets an entry when a requested column has a predicate on it, has the same path as
 * the chunk, and the chunk's encodings pass {@code isOnlyDictionaryEncodingPages}. Only the
 * first such requested column is considered for each chunk. A chunk whose bytes cannot be read
 * gets no entry.
 *
 * @param blockMetadata metadata of the block whose column chunks are inspected
 * @param dataSource source the raw column chunk bytes are read from
 * @param requestedSchema schema whose columns are matched against the chunks
 * @param effectivePredicate predicate used to decide which columns are of interest
 * @return an immutable map from column chunk ordinal to its dictionary descriptor
 */
private static Map<Integer, ParquetDictionaryDescriptor> getDictionariesByColumnOrdinal(BlockMetaData blockMetadata, ParquetDataSource dataSource, MessageType requestedSchema, TupleDomain<HiveColumnHandle> effectivePredicate) {
    ImmutableMap.Builder<Integer, ParquetDictionaryDescriptor> result = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < blockMetadata.getColumns().size(); ordinal++) {
        ColumnChunkMetaData chunk = blockMetadata.getColumns().get(ordinal);
        for (ColumnDescriptor candidate : requestedSchema.getColumns()) {
            if (!isColumnPredicate(candidate, effectivePredicate)) {
                continue;
            }
            if (!chunk.getPath().equals(ColumnPath.get(candidate.getPath()))) {
                continue;
            }
            if (!isOnlyDictionaryEncodingPages(chunk.getEncodings())) {
                continue;
            }
            try {
                // Load the whole column chunk into memory; the dictionary page is parsed from its start.
                byte[] chunkBytes = new byte[toIntExact(chunk.getTotalSize())];
                dataSource.readFully(chunk.getStartingPos(), chunkBytes);
                Optional<ParquetDictionaryPage> dictionaryPage = readDictionaryPage(chunkBytes, chunk.getCodec());
                result.put(ordinal, new ParquetDictionaryDescriptor(candidate, dictionaryPage));
            } catch (IOException ignored) {
                // Best effort: a chunk that cannot be read simply gets no dictionary entry.
            }
            // Stop at the first requested column matching this chunk.
            break;
        }
    }
    return result.build();
}
Also used : ParquetDictionaryPage(com.facebook.presto.hive.parquet.ParquetDictionaryPage) ColumnChunkMetaData(parquet.hadoop.metadata.ColumnChunkMetaData) ColumnDescriptor(parquet.column.ColumnDescriptor) IOException(java.io.IOException) ImmutableMap(com.google.common.collect.ImmutableMap)

Aggregations

ParquetDictionaryPage (com.facebook.presto.hive.parquet.ParquetDictionaryPage)5 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)2 ColumnDescriptor (parquet.column.ColumnDescriptor)2 DictionaryPageHeader (parquet.format.DictionaryPageHeader)2 PageHeader (parquet.format.PageHeader)2 ParquetCorruptionException (com.facebook.presto.hive.parquet.ParquetCorruptionException)1 ParquetDataPage (com.facebook.presto.hive.parquet.ParquetDataPage)1 ParquetEncoding (com.facebook.presto.hive.parquet.ParquetEncoding)1 ParquetTypeUtils.getParquetEncoding (com.facebook.presto.hive.parquet.ParquetTypeUtils.getParquetEncoding)1 ParquetDictionary (com.facebook.presto.hive.parquet.dictionary.ParquetDictionary)1 Domain (com.facebook.presto.spi.predicate.Domain)1 TupleDomain (com.facebook.presto.spi.predicate.TupleDomain)1 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 Slice (io.airlift.slice.Slice)1 ByteArrayInputStream (java.io.ByteArrayInputStream)1 List (java.util.List)1 DataPageHeader (parquet.format.DataPageHeader)1