Use of com.facebook.presto.hive.parquet.ParquetDictionaryPage in project presto by prestodb.
In the class TupleDomainParquetPredicate, method getDomain:
@VisibleForTesting
public static Domain getDomain(Type type, ParquetDictionaryDescriptor dictionaryDescriptor) {
    if (dictionaryDescriptor == null) {
        return null;
    }
    ColumnDescriptor columnDescriptor = dictionaryDescriptor.getColumnDescriptor();
    Optional<ParquetDictionaryPage> dictionaryPage = dictionaryDescriptor.getDictionaryPage();
    if (!dictionaryPage.isPresent()) {
        return null;
    }
    ParquetDictionary dictionary;
    try {
        dictionary = dictionaryPage.get().getEncoding().initDictionary(columnDescriptor, dictionaryPage.get());
    } catch (Exception e) {
        // OK to ignore exception when reading dictionaries: a null domain simply
        // disables dictionary-based pruning for this column
        return null;
    }
    int dictionarySize = dictionaryPage.get().getDictionarySize();
    if (type.equals(BIGINT) && columnDescriptor.getType() == PrimitiveTypeName.INT64) {
        List<Domain> domains = new ArrayList<>();
        for (int i = 0; i < dictionarySize; i++) {
            domains.add(Domain.singleValue(type, dictionary.decodeToLong(i)));
        }
        // the dictionary lists distinct values only, so null must always be admitted
        domains.add(Domain.onlyNull(type));
        return Domain.union(domains);
    } else if (type.equals(BIGINT) && columnDescriptor.getType() == PrimitiveTypeName.INT32) {
        List<Domain> domains = new ArrayList<>();
        for (int i = 0; i < dictionarySize; i++) {
            domains.add(Domain.singleValue(type, (long) dictionary.decodeToInt(i)));
        }
        domains.add(Domain.onlyNull(type));
        return Domain.union(domains);
    } else if (type.equals(DOUBLE) && columnDescriptor.getType() == PrimitiveTypeName.DOUBLE) {
        List<Domain> domains = new ArrayList<>();
        for (int i = 0; i < dictionarySize; i++) {
            domains.add(Domain.singleValue(type, dictionary.decodeToDouble(i)));
        }
        domains.add(Domain.onlyNull(type));
        return Domain.union(domains);
    } else if (type.equals(DOUBLE) && columnDescriptor.getType() == PrimitiveTypeName.FLOAT) {
        List<Domain> domains = new ArrayList<>();
        for (int i = 0; i < dictionarySize; i++) {
            domains.add(Domain.singleValue(type, (double) dictionary.decodeToFloat(i)));
        }
        domains.add(Domain.onlyNull(type));
        return Domain.union(domains);
    } else if (isVarcharType(type) && columnDescriptor.getType() == PrimitiveTypeName.BINARY) {
        List<Domain> domains = new ArrayList<>();
        for (int i = 0; i < dictionarySize; i++) {
            domains.add(Domain.singleValue(type, Slices.wrappedBuffer(dictionary.decodeToBinary(i).getBytes())));
        }
        domains.add(Domain.onlyNull(type));
        return Domain.union(domains);
    }
    return null;
}
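To make the result's shape concrete, here is a minimal, self-contained sketch (not part of the Presto source) that builds the same union of single-value domains for a tiny INT64 dictionary and probes it with Domain.overlaps. The literal values 10, 20, 30 stand in for dictionary.decodeToLong(i); Domain and BIGINT come from the Presto SPI.

import java.util.ArrayList;
import java.util.List;

import com.facebook.presto.spi.predicate.Domain;

import static com.facebook.presto.spi.type.BigintType.BIGINT;

public class DictionaryDomainSketch {
    public static void main(String[] args) {
        // stand-ins for dictionary.decodeToLong(0..2)
        List<Domain> domains = new ArrayList<>();
        for (long value : new long[] {10L, 20L, 30L}) {
            domains.add(Domain.singleValue(BIGINT, value));
        }
        // null is always admitted: a dictionary page lists distinct values only
        // and says nothing about whether the column chunk also contains nulls
        domains.add(Domain.onlyNull(BIGINT));
        Domain dictionaryDomain = Domain.union(domains);

        System.out.println(dictionaryDomain.overlaps(Domain.singleValue(BIGINT, 20L))); // true: cannot prune
        System.out.println(dictionaryDomain.overlaps(Domain.singleValue(BIGINT, 99L))); // false: row group prunable
    }
}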
Use of com.facebook.presto.hive.parquet.ParquetDictionaryPage in project presto by prestodb.
In the class ParquetColumnChunk, method readAllPages:
public ParquetPageReader readAllPages() throws IOException {
    List<ParquetDataPage> pages = new ArrayList<>();
    ParquetDictionaryPage dictionaryPage = null;
    long valueCount = 0;
    while (valueCount < descriptor.getColumnChunkMetaData().getValueCount()) {
        PageHeader pageHeader = readPageHeader();
        int uncompressedPageSize = pageHeader.getUncompressed_page_size();
        int compressedPageSize = pageHeader.getCompressed_page_size();
        switch (pageHeader.type) {
            case DICTIONARY_PAGE:
                if (dictionaryPage != null) {
                    throw new ParquetCorruptionException("%s has more than one dictionary page in column chunk", descriptor.getColumnDescriptor());
                }
                dictionaryPage = readDictionaryPage(pageHeader, uncompressedPageSize, compressedPageSize);
                break;
            case DATA_PAGE:
                valueCount += readDataPageV1(pageHeader, uncompressedPageSize, compressedPageSize, pages);
                break;
            case DATA_PAGE_V2:
                valueCount += readDataPageV2(pageHeader, uncompressedPageSize, compressedPageSize, pages);
                break;
            default:
                skip(compressedPageSize);
                break;
        }
    }
    return new ParquetPageReader(descriptor.getColumnChunkMetaData().getCodec(), pages, dictionaryPage);
}
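The same header-dispatch pattern can be shown in a self-contained sketch (an assumed helper, not Presto code) built directly on parquet-format's Util.readPageHeader. Like readAllPages(), it stops once the chunk's declared value count has been consumed, so trailing bytes are never misread as another page.

import java.io.IOException;
import java.io.InputStream;

import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.Util;

final class PageCounter {
    static void countPages(InputStream chunk, long totalValues) throws IOException {
        int dictionaryPages = 0;
        int dataPages = 0;
        long values = 0;
        while (values < totalValues) {
            PageHeader header = Util.readPageHeader(chunk);
            switch (header.getType()) {
                case DICTIONARY_PAGE:
                    dictionaryPages++;
                    break;
                case DATA_PAGE:
                    dataPages++;
                    values += header.getData_page_header().getNum_values();
                    break;
                case DATA_PAGE_V2:
                    dataPages++;
                    values += header.getData_page_header_v2().getNum_values();
                    break;
                default:
                    // e.g. INDEX_PAGE: counted toward neither; body skipped below
                    break;
            }
            // only the headers are inspected here; skip each page body
            long toSkip = header.getCompressed_page_size();
            while (toSkip > 0) {
                toSkip -= chunk.skip(toSkip);
            }
        }
        System.out.printf("%d dictionary page(s), %d data page(s)%n", dictionaryPages, dataPages);
    }
}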
Use of com.facebook.presto.hive.parquet.ParquetDictionaryPage in project presto by prestodb.
In the class ParquetColumnReader, method setPageReader:
public void setPageReader(ParquetPageReader pageReader) {
    this.pageReader = requireNonNull(pageReader, "pageReader is null");
    ParquetDictionaryPage dictionaryPage = pageReader.readDictionaryPage();
    if (dictionaryPage != null) {
        try {
            dictionary = dictionaryPage.getEncoding().initDictionary(columnDescriptor, dictionaryPage);
        } catch (IOException e) {
            throw new ParquetDecodingException("could not decode the dictionary for " + columnDescriptor, e);
        }
    } else {
        dictionary = null;
    }
    checkArgument(pageReader.getTotalValueCount() > 0, "page is empty");
    totalValueCount = pageReader.getTotalValueCount();
}
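A hypothetical sketch (not Presto code) of why setPageReader() keeps the dictionary nullable: dictionary-encoded pages resolve ids through it, while plain-encoded pages read values directly. ValueSource and its two methods are assumed placeholders for whatever values reader the column reader actually drives; ParquetDictionary is the same class used in the snippets above.

final class LongColumnDecoder {
    // assumed placeholder for the underlying values reader
    interface ValueSource {
        int readId();    // next dictionary id (dictionary-encoded pages)
        long readLong(); // next literal value (plain-encoded pages)
    }

    private final ParquetDictionary dictionary; // nullable, exactly as set by setPageReader()
    private final ValueSource values;

    LongColumnDecoder(ParquetDictionary dictionary, ValueSource values) {
        this.dictionary = dictionary;
        this.values = values;
    }

    long nextLong() {
        return (dictionary != null)
                ? dictionary.decodeToLong(values.readId())
                : values.readLong();
    }
}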
Use of com.facebook.presto.hive.parquet.ParquetDictionaryPage in project presto by prestodb.
In the class ParquetPredicateUtils, method readDictionaryPage:
private static Optional<ParquetDictionaryPage> readDictionaryPage(byte[] data, CompressionCodecName codecName) {
    try {
        ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
        PageHeader pageHeader = Util.readPageHeader(inputStream);
        if (pageHeader.type != PageType.DICTIONARY_PAGE) {
            return Optional.empty();
        }
        // the header reader consumed (data.length - available()) bytes, so the
        // compressed page body starts at exactly that offset
        Slice compressedData = wrappedBuffer(data, data.length - inputStream.available(), pageHeader.getCompressed_page_size());
        DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header();
        ParquetEncoding encoding = getParquetEncoding(Encoding.valueOf(dicHeader.getEncoding().name()));
        int dictionarySize = dicHeader.getNum_values();
        return Optional.of(new ParquetDictionaryPage(decompress(codecName, compressedData, pageHeader.getUncompressed_page_size()), dictionarySize, encoding));
    } catch (IOException ignored) {
        return Optional.empty();
    }
}
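The offset arithmetic data.length - inputStream.available() is worth a JDK-only illustration: once the Thrift header has been consumed from the ByteArrayInputStream, available() reports the unread remainder, so the difference is exactly where the compressed page body begins.

import java.io.ByteArrayInputStream;
import java.io.IOException;

final class OffsetSketch {
    public static void main(String[] args) throws IOException {
        byte[] data = {1, 2, 3, 4, 5, 6, 7, 8};
        ByteArrayInputStream in = new ByteArrayInputStream(data);
        in.read(new byte[3]); // stand-in for Util.readPageHeader consuming a 3-byte header
        int bodyOffset = data.length - in.available();
        System.out.println(bodyOffset); // prints 3: the page body starts right after the header
    }
}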
Use of com.facebook.presto.hive.parquet.ParquetDictionaryPage in project presto by prestodb.
In the class ParquetPredicateUtils, method getDictionariesByColumnOrdinal:
private static Map<Integer, ParquetDictionaryDescriptor> getDictionariesByColumnOrdinal(BlockMetaData blockMetadata, ParquetDataSource dataSource, MessageType requestedSchema, TupleDomain<HiveColumnHandle> effectivePredicate) {
    ImmutableMap.Builder<Integer, ParquetDictionaryDescriptor> dictionaries = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < blockMetadata.getColumns().size(); ordinal++) {
        ColumnChunkMetaData columnChunkMetaData = blockMetadata.getColumns().get(ordinal);
        for (int i = 0; i < requestedSchema.getColumns().size(); i++) {
            ColumnDescriptor columnDescriptor = requestedSchema.getColumns().get(i);
            if (isColumnPredicate(columnDescriptor, effectivePredicate)
                    && columnChunkMetaData.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))
                    && isOnlyDictionaryEncodingPages(columnChunkMetaData.getEncodings())) {
                try {
                    int totalSize = toIntExact(columnChunkMetaData.getTotalSize());
                    byte[] buffer = new byte[totalSize];
                    dataSource.readFully(columnChunkMetaData.getStartingPos(), buffer);
                    Optional<ParquetDictionaryPage> dictionaryPage = readDictionaryPage(buffer, columnChunkMetaData.getCodec());
                    dictionaries.put(ordinal, new ParquetDictionaryDescriptor(columnDescriptor, dictionaryPage));
                } catch (IOException ignored) {
                    // an unreadable chunk simply contributes no dictionary
                }
                break;
            }
        }
    }
    return dictionaries.build();
}
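A hypothetical piece of glue (composed from the snippets above, not verbatim Presto code) showing how the per-ordinal map can feed getDomain() from the first snippet. The columnTypes and predicateDomains maps are assumed to be supplied by the caller; imports match the earlier snippets.

private static boolean rowGroupCanMatch(
        Map<Integer, ParquetDictionaryDescriptor> dictionaries,
        Map<Integer, Type> columnTypes,        // assumed caller-supplied lookup
        Map<Integer, Domain> predicateDomains) // assumed caller-supplied lookup
{
    for (Map.Entry<Integer, ParquetDictionaryDescriptor> entry : dictionaries.entrySet()) {
        Domain dictionaryDomain = TupleDomainParquetPredicate.getDomain(columnTypes.get(entry.getKey()), entry.getValue());
        Domain predicateDomain = predicateDomains.get(entry.getKey());
        // getDomain() returns null when it cannot draw a conclusion; only a
        // definite non-overlap lets the row group be skipped
        if (dictionaryDomain != null && predicateDomain != null && !dictionaryDomain.overlaps(predicateDomain)) {
            return false;
        }
    }
    return true;
}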