Search in sources:

Example 1 with ColumnDescriptor

use of parquet.column.ColumnDescriptor in project presto by prestodb.

the class TupleDomainParquetPredicate method getDomain.

/**
 * Derives the set of values a column may contain from its Parquet dictionary page.
 *
 * <p>The result is the union of one single-value domain per dictionary entry plus the
 * only-null domain.
 *
 * @param type the engine type the column is read as
 * @param dictionaryDescriptor the column descriptor paired with its optional dictionary page
 * @return the computed domain, or {@code null} when the descriptor is {@code null}, no
 *         dictionary page is present, the dictionary cannot be initialized, or the
 *         combination of {@code type} and Parquet primitive type is not handled here
 */
@VisibleForTesting
public static Domain getDomain(Type type, ParquetDictionaryDescriptor dictionaryDescriptor) {
    if (dictionaryDescriptor == null) {
        return null;
    }

    ColumnDescriptor columnDescriptor = dictionaryDescriptor.getColumnDescriptor();
    Optional<ParquetDictionaryPage> dictionaryPage = dictionaryDescriptor.getDictionaryPage();
    if (!dictionaryPage.isPresent()) {
        return null;
    }
    ParquetDictionaryPage page = dictionaryPage.get();

    ParquetDictionary dictionary;
    try {
        dictionary = page.getEncoding().initDictionary(columnDescriptor, page);
    } catch (Exception e) {
        // OK to ignore exception when reading dictionaries
        return null;
    }

    int entryCount = page.getDictionarySize();
    List<Domain> domains = new ArrayList<>();

    // Each branch decodes every dictionary entry with the accessor matching the
    // Parquet primitive type, widening to the engine type where necessary.
    if (type.equals(BIGINT) && columnDescriptor.getType() == PrimitiveTypeName.INT64) {
        for (int entry = 0; entry < entryCount; entry++) {
            domains.add(Domain.singleValue(type, dictionary.decodeToLong(entry)));
        }
    } else if (type.equals(BIGINT) && columnDescriptor.getType() == PrimitiveTypeName.INT32) {
        for (int entry = 0; entry < entryCount; entry++) {
            long widened = dictionary.decodeToInt(entry);
            domains.add(Domain.singleValue(type, widened));
        }
    } else if (type.equals(DOUBLE) && columnDescriptor.getType() == PrimitiveTypeName.DOUBLE) {
        for (int entry = 0; entry < entryCount; entry++) {
            domains.add(Domain.singleValue(type, dictionary.decodeToDouble(entry)));
        }
    } else if (type.equals(DOUBLE) && columnDescriptor.getType() == PrimitiveTypeName.FLOAT) {
        for (int entry = 0; entry < entryCount; entry++) {
            double widened = dictionary.decodeToFloat(entry);
            domains.add(Domain.singleValue(type, widened));
        }
    } else if (isVarcharType(type) && columnDescriptor.getType() == PrimitiveTypeName.BINARY) {
        for (int entry = 0; entry < entryCount; entry++) {
            byte[] bytes = dictionary.decodeToBinary(entry).getBytes();
            domains.add(Domain.singleValue(type, Slices.wrappedBuffer(bytes)));
        }
    } else {
        // Unsupported type pairing: no domain can be derived.
        return null;
    }

    // The only-null domain is always added to the union alongside the dictionary values.
    domains.add(Domain.onlyNull(type));
    return Domain.union(domains);
}
Also used : ParquetDictionaryPage(com.facebook.presto.hive.parquet.ParquetDictionaryPage) ColumnDescriptor(parquet.column.ColumnDescriptor) ArrayList(java.util.ArrayList) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) ParquetDictionary(com.facebook.presto.hive.parquet.dictionary.ParquetDictionary) TupleDomain(com.facebook.presto.spi.predicate.TupleDomain) Domain(com.facebook.presto.spi.predicate.Domain) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 2 with ColumnDescriptor

use of parquet.column.ColumnDescriptor in project presto by prestodb.

the class ParquetReader method initializeColumnReaders.

/**
 * Creates one column reader per primitive column returned by
 * {@code getColumns(fileSchema, requestedSchema)} and stores it in
 * {@code columnReadersMap}, keyed by the column's {@link RichColumnDescriptor}.
 */
private void initializeColumnReaders() {
    for (PrimitiveColumnIO columnIO : getColumns(fileSchema, requestedSchema)) {
        ColumnDescriptor descriptor = columnIO.getColumnDescriptor();
        RichColumnDescriptor column = new RichColumnDescriptor(
                descriptor.getPath(),
                columnIO.getType().asPrimitiveType(),
                descriptor.getMaxRepetitionLevel(),
                descriptor.getMaxDefinitionLevel());
        ParquetColumnReader reader = ParquetColumnReader.createReader(column);
        columnReadersMap.put(column, reader);
    }
}
Also used : RichColumnDescriptor(com.facebook.presto.hive.parquet.RichColumnDescriptor) RichColumnDescriptor(com.facebook.presto.hive.parquet.RichColumnDescriptor) ColumnDescriptor(parquet.column.ColumnDescriptor) PrimitiveColumnIO(parquet.io.PrimitiveColumnIO)

Example 3 with ColumnDescriptor

use of parquet.column.ColumnDescriptor in project presto by prestodb.

the class ParquetReader method nextBatch.

/**
 * Advances the reader by one batch of rows.
 *
 * <p>When the current row group is exhausted, the reader first tries to advance to the
 * next row group. The batch size is capped at {@code MAX_VECTOR_LENGTH} and at the number
 * of rows left in the current group; every column reader is then told how many rows to
 * prepare.
 *
 * @return the number of rows in the batch, or {@code -1} when no further row group exists
 */
public int nextBatch() {
    if (nextRowInGroup >= currentGroupRowCount) {
        // Only attempt to advance once the current group has been fully consumed.
        if (!advanceToNextRowGroup()) {
            return -1;
        }
    }

    batchSize = toIntExact(min(MAX_VECTOR_LENGTH, currentGroupRowCount - nextRowInGroup));
    nextRowInGroup += batchSize;
    currentPosition += batchSize;

    for (PrimitiveColumnIO columnIO : getColumns(fileSchema, requestedSchema)) {
        ColumnDescriptor descriptor = columnIO.getColumnDescriptor();
        RichColumnDescriptor column = new RichColumnDescriptor(
                descriptor.getPath(),
                columnIO.getType().asPrimitiveType(),
                descriptor.getMaxRepetitionLevel(),
                descriptor.getMaxDefinitionLevel());
        columnReadersMap.get(column).prepareNextRead(batchSize);
    }
    return batchSize;
}
Also used : RichColumnDescriptor(com.facebook.presto.hive.parquet.RichColumnDescriptor) RichColumnDescriptor(com.facebook.presto.hive.parquet.RichColumnDescriptor) ColumnDescriptor(parquet.column.ColumnDescriptor) PrimitiveColumnIO(parquet.io.PrimitiveColumnIO)

Example 4 with ColumnDescriptor

use of parquet.column.ColumnDescriptor in project presto by prestodb.

the class ParquetTester method assertFileContents.

/**
 * Reads the first column of the Parquet file behind {@code tempFile} batch by batch and
 * asserts that the decoded values equal {@code expectedValues}, in order.
 *
 * <p>Also verifies that the reader's reported position starts at zero and tracks the
 * number of rows consumed after every batch, and that no expected values remain once
 * the file is exhausted.
 *
 * @param jobConf Hadoop configuration used to resolve the file system
 * @param tempFile the temporary Parquet file to read
 * @param expectedValues the values the first column is expected to contain, in order
 * @param type the engine type used to read and decode the column
 * @throws IOException if the file cannot be opened, read, or closed
 * @throws InterruptedException declared for callers; propagated from the underlying calls
 */
private static void assertFileContents(JobConf jobConf, TempFile tempFile, Iterable<?> expectedValues, Type type) throws IOException, InterruptedException {
    Path path = new Path(tempFile.getFile().toURI());
    FileSystem fileSystem = path.getFileSystem(jobConf);
    ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
    FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    MessageType fileSchema = fileMetaData.getSchema();
    long size = fileSystem.getFileStatus(path).getLen();
    FSDataInputStream inputStream = fileSystem.open(path);
    ParquetDataSource dataSource = new HdfsParquetDataSource(path, size, inputStream);
    TypeManager typeManager = new TypeRegistry();
    ParquetReader parquetReader = new ParquetReader(fileSchema, fileSchema, parquetMetadata.getBlocks(), dataSource, typeManager, new AggregatedMemoryContext());
    // Close the reader even when an assertion fails or a read throws, so a failing
    // test does not leave the reader open.
    try {
        assertEquals(parquetReader.getPosition(), 0);
        int rowsProcessed = 0;
        Iterator<?> iterator = expectedValues.iterator();
        for (int batchSize = parquetReader.nextBatch(); batchSize >= 0; batchSize = parquetReader.nextBatch()) {
            // Only the first column of the file schema is verified.
            ColumnDescriptor columnDescriptor = fileSchema.getColumns().get(0);
            Block block = parquetReader.readPrimitive(columnDescriptor, type);
            for (int i = 0; i < batchSize; i++) {
                assertTrue(iterator.hasNext());
                Object expected = iterator.next();
                Object actual = decodeObject(type, block, i);
                assertEquals(actual, expected);
            }
            rowsProcessed += batchSize;
            assertEquals(parquetReader.getPosition(), rowsProcessed);
        }
        // Every expected value must have been consumed by the time the file is exhausted.
        assertFalse(iterator.hasNext());
        assertEquals(parquetReader.getPosition(), rowsProcessed);
    } finally {
        parquetReader.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ParquetMetadata(parquet.hadoop.metadata.ParquetMetadata) ColumnDescriptor(parquet.column.ColumnDescriptor) ParquetReader(com.facebook.presto.hive.parquet.reader.ParquetReader) TypeRegistry(com.facebook.presto.type.TypeRegistry) AggregatedMemoryContext(com.facebook.presto.hive.parquet.memory.AggregatedMemoryContext) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) TypeManager(com.facebook.presto.spi.type.TypeManager) Block(com.facebook.presto.spi.block.Block) FileMetaData(parquet.hadoop.metadata.FileMetaData) MessageType(parquet.schema.MessageType)

Example 5 with ColumnDescriptor

use of parquet.column.ColumnDescriptor in project presto by prestodb.

the class ParquetTypeUtils method getDescriptor.

/**
 * Finds the primitive column whose path matches {@code path} (case-insensitively) among
 * the columns returned by {@code getColumns(fileSchema, requestedSchema)}.
 *
 * <p>The column's path array is compared starting at index 1, so {@code path.get(i)} is
 * matched against {@code fields[i + 1]}. Columns whose path array is not longer than
 * {@code path} are skipped.
 *
 * @param fileSchema the schema of the Parquet file
 * @param requestedSchema the schema requested by the reader
 * @param path the nested column path; must contain at least one component
 * @return a descriptor for the first matching column, or an empty {@link Optional}
 * @throws IllegalArgumentException if {@code path} is empty
 */
public static Optional<RichColumnDescriptor> getDescriptor(MessageType fileSchema, MessageType requestedSchema, List<String> path) {
    checkArgument(path.size() >= 1, "Parquet nested path should have at least one component");
    int level = path.size();
    for (PrimitiveColumnIO columnIO : getColumns(fileSchema, requestedSchema)) {
        ColumnIO[] fields = columnIO.getPath();
        // The leaf component is checked first; ancestors are only compared when it matches.
        if (fields.length > level && fields[level].getName().equalsIgnoreCase(path.get(level - 1))) {
            boolean ancestorsMatch = true;
            for (int depth = 1; depth < level; depth++) {
                ancestorsMatch &= fields[depth].getName().equalsIgnoreCase(path.get(depth - 1));
            }
            if (ancestorsMatch) {
                ColumnDescriptor descriptor = columnIO.getColumnDescriptor();
                RichColumnDescriptor richDescriptor = new RichColumnDescriptor(
                        descriptor.getPath(),
                        columnIO.getType().asPrimitiveType(),
                        descriptor.getMaxRepetitionLevel(),
                        descriptor.getMaxDefinitionLevel());
                return Optional.of(richDescriptor);
            }
        }
    }
    return empty();
}
Also used : ColumnDescriptor(parquet.column.ColumnDescriptor) ColumnIO(parquet.io.ColumnIO) PrimitiveColumnIO(parquet.io.PrimitiveColumnIO) PrimitiveColumnIO(parquet.io.PrimitiveColumnIO)

Aggregations

ColumnDescriptor (parquet.column.ColumnDescriptor)6 PrimitiveColumnIO (parquet.io.PrimitiveColumnIO)3 ParquetDictionaryPage (com.facebook.presto.hive.parquet.ParquetDictionaryPage)2 RichColumnDescriptor (com.facebook.presto.hive.parquet.RichColumnDescriptor)2 ParquetDictionary (com.facebook.presto.hive.parquet.dictionary.ParquetDictionary)1 AggregatedMemoryContext (com.facebook.presto.hive.parquet.memory.AggregatedMemoryContext)1 ParquetReader (com.facebook.presto.hive.parquet.reader.ParquetReader)1 Block (com.facebook.presto.spi.block.Block)1 Domain (com.facebook.presto.spi.predicate.Domain)1 TupleDomain (com.facebook.presto.spi.predicate.TupleDomain)1 TypeManager (com.facebook.presto.spi.type.TypeManager)1 TypeRegistry (com.facebook.presto.type.TypeRegistry)1 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)1 FileSystem (org.apache.hadoop.fs.FileSystem)1