Search in sources :

Example 1 with ParquetReader

Use of com.facebook.presto.hive.parquet.reader.ParquetReader in the project presto by prestodb.

The method assertFileContents of the class ParquetTester.

/**
 * Reads the Parquet file held by {@code tempFile} back through a {@link ParquetReader} and asserts
 * that the first column of the file schema decodes to exactly {@code expectedValues}, in order.
 *
 * <p>The reader position is checked before the first batch, after every batch, and once more after
 * the last batch, so that it always equals the number of rows consumed so far.
 *
 * @param jobConf configuration used to resolve the {@link FileSystem} for the file
 * @param tempFile the file to read
 * @param expectedValues the values the first column is expected to contain, in row order
 * @param type the type used to read and decode the column
 * @throws IOException if reading the footer, opening the file, or closing the reader fails
 * @throws InterruptedException declared for callers; not thrown directly by the visible code
 */
private static void assertFileContents(JobConf jobConf, TempFile tempFile, Iterable<?> expectedValues, Type type) throws IOException, InterruptedException {
    Path path = new Path(tempFile.getFile().toURI());
    FileSystem fileSystem = path.getFileSystem(jobConf);
    ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
    FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    MessageType fileSchema = fileMetaData.getSchema();
    long size = fileSystem.getFileStatus(path).getLen();
    FSDataInputStream inputStream = fileSystem.open(path);
    ParquetDataSource dataSource = new HdfsParquetDataSource(path, size, inputStream);
    TypeManager typeManager = new TypeRegistry();
    ParquetReader parquetReader = new ParquetReader(fileSchema, fileSchema, parquetMetadata.getBlocks(), dataSource, typeManager, new AggregatedMemoryContext());
    // Close the reader even when an assertion or a read fails; otherwise a failing test leaks the reader.
    try {
        assertEquals(parquetReader.getPosition(), 0);
        int rowsProcessed = 0;
        Iterator<?> iterator = expectedValues.iterator();
        // The loop ends as soon as nextBatch() reports a negative batch size.
        for (int batchSize = parquetReader.nextBatch(); batchSize >= 0; batchSize = parquetReader.nextBatch()) {
            // Only the first column of the file schema is verified.
            ColumnDescriptor columnDescriptor = fileSchema.getColumns().get(0);
            Block block = parquetReader.readPrimitive(columnDescriptor, type);
            for (int i = 0; i < batchSize; i++) {
                assertTrue(iterator.hasNext());
                Object expected = iterator.next();
                Object actual = decodeObject(type, block, i);
                assertEquals(actual, expected);
            }
            rowsProcessed += batchSize;
            assertEquals(parquetReader.getPosition(), rowsProcessed);
        }
        // Every expected value must have been consumed by the batches read above.
        assertFalse(iterator.hasNext());
        assertEquals(parquetReader.getPosition(), rowsProcessed);
    } finally {
        parquetReader.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ParquetMetadata(parquet.hadoop.metadata.ParquetMetadata) ColumnDescriptor(parquet.column.ColumnDescriptor) ParquetReader(com.facebook.presto.hive.parquet.reader.ParquetReader) TypeRegistry(com.facebook.presto.type.TypeRegistry) AggregatedMemoryContext(com.facebook.presto.hive.parquet.memory.AggregatedMemoryContext) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) TypeManager(com.facebook.presto.spi.type.TypeManager) Block(com.facebook.presto.spi.block.Block) FileMetaData(parquet.hadoop.metadata.FileMetaData) MessageType(parquet.schema.MessageType)

Example 2 with ParquetReader

Use of com.facebook.presto.hive.parquet.reader.ParquetReader in the project presto by prestodb.

The method createParquetPageSource of the class ParquetPageSourceFactory.

/**
 * Opens the Parquet file at {@code path} and builds a {@link ParquetPageSource} for the split
 * described by {@code start} and {@code length}.
 *
 * <p>Steps performed:
 * <ol>
 *   <li>Build the data source and read the file footer.</li>
 *   <li>Build the requested schema from the {@code REGULAR} columns, dropping columns for which
 *       {@code getParquetType} returns {@code null}.</li>
 *   <li>Keep only the blocks whose first column's first data page offset lies in
 *       {@code [start, start + length)}.</li>
 *   <li>When {@code predicatePushdownEnabled} is set, further drop blocks rejected by
 *       {@code predicateMatches}.</li>
 * </ol>
 *
 * <p>On any failure the data source (if already created) is closed. A failure of that close is
 * attached to the original exception as a suppressed exception and never replaces it.
 *
 * @param hdfsEnvironment used to obtain the {@link FileSystem} for {@code path}
 * @param user the user on whose behalf the file system is obtained
 * @param configuration Hadoop configuration passed to {@code hdfsEnvironment}
 * @param path the Parquet file to open
 * @param start start offset of the split
 * @param length length of the split
 * @param schema table properties, passed through to the page source
 * @param columns the columns requested from the split
 * @param useParquetColumnNames passed to {@code getParquetType} and to the page source
 * @param typeManager type manager passed to the predicate builder, the reader and the page source
 * @param predicatePushdownEnabled whether blocks are additionally filtered by {@code effectivePredicate}
 * @param effectivePredicate the predicate used for block filtering and passed to the page source
 * @return the page source for the split
 * @throws PrestoException rethrown unchanged when raised inside; otherwise wrapping the failure with
 *         {@code HIVE_MISSING_DATA} (exception class simple name is {@code BlockMissingException})
 *         or {@code HIVE_CANNOT_OPEN_SPLIT} (all other failures)
 */
public static ParquetPageSource createParquetPageSource(HdfsEnvironment hdfsEnvironment, String user, Configuration configuration, Path path, long start, long length, Properties schema, List<HiveColumnHandle> columns, boolean useParquetColumnNames, TypeManager typeManager, boolean predicatePushdownEnabled, TupleDomain<HiveColumnHandle> effectivePredicate) {
    AggregatedMemoryContext systemMemoryContext = new AggregatedMemoryContext();
    ParquetDataSource dataSource = null;
    try {
        FileSystem fileSystem = hdfsEnvironment.getFileSystem(user, path, configuration);
        dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length);
        ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();

        // Requested schema: only REGULAR columns that resolve to a Parquet type in this file.
        List<parquet.schema.Type> fields = columns.stream()
                .filter(column -> column.getColumnType() == REGULAR)
                .map(column -> getParquetType(column, fileSchema, useParquetColumnNames))
                .filter(Objects::nonNull)
                .collect(toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);

        // A block belongs to this split when the first data page of its first column starts inside it.
        List<BlockMetaData> blocks = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                blocks.add(block);
            }
        }

        if (predicatePushdownEnabled) {
            ParquetPredicate parquetPredicate = buildParquetPredicate(columns, effectivePredicate, fileSchema, typeManager);
            // dataSource is reassigned above, so the lambda needs a final alias.
            final ParquetDataSource finalDataSource = dataSource;
            blocks = blocks.stream()
                    .filter(block -> predicateMatches(parquetPredicate, block, finalDataSource, requestedSchema, effectivePredicate))
                    .collect(toList());
        }

        ParquetReader parquetReader = new ParquetReader(fileSchema, requestedSchema, blocks, dataSource, typeManager, systemMemoryContext);
        return new ParquetPageSource(parquetReader, dataSource, fileSchema, requestedSchema, length, schema, columns, effectivePredicate, typeManager, useParquetColumnNames, systemMemoryContext);
    } catch (Exception e) {
        if (dataSource != null) {
            try {
                dataSource.close();
            } catch (IOException closeFailure) {
                // Cleanup stays best-effort: keep the original failure primary, but do not lose
                // the close failure entirely.
                e.addSuppressed(closeFailure);
            }
        }
        if (e instanceof PrestoException) {
            throw (PrestoException) e;
        }
        String message = format("Error opening Hive split %s (offset=%s, length=%s): %s", path, start, length, e.getMessage());
        // Matched by simple class name rather than by type.
        if (e.getClass().getSimpleName().equals("BlockMissingException")) {
            throw new PrestoException(HIVE_MISSING_DATA, message, e);
        }
        throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT, message, e);
    }
}
Also used : HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) HdfsEnvironment(com.facebook.presto.hive.HdfsEnvironment) DateTimeZone(org.joda.time.DateTimeZone) ParquetTypeUtils.getParquetType(com.facebook.presto.hive.parquet.ParquetTypeUtils.getParquetType) TypeManager(com.facebook.presto.spi.type.TypeManager) FileSystem(org.apache.hadoop.fs.FileSystem) REGULAR(com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR) PrestoException(com.facebook.presto.spi.PrestoException) HIVE_CANNOT_OPEN_SPLIT(com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT) HIVE_MISSING_DATA(com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA) HiveSessionProperties.isParquetOptimizedReaderEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetOptimizedReaderEnabled) ArrayList(java.util.ArrayList) ParquetReader(com.facebook.presto.hive.parquet.reader.ParquetReader) Inject(javax.inject.Inject) ParquetPredicate(com.facebook.presto.hive.parquet.predicate.ParquetPredicate) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(parquet.hadoop.metadata.ParquetMetadata) Objects.requireNonNull(java.util.Objects.requireNonNull) Path(org.apache.hadoop.fs.Path) ParquetMetadataReader(com.facebook.presto.hive.parquet.reader.ParquetMetadataReader) HiveClientConfig(com.facebook.presto.hive.HiveClientConfig) HiveSessionProperties.isParquetPredicatePushdownEnabled(com.facebook.presto.hive.HiveSessionProperties.isParquetPredicatePushdownEnabled) ImmutableSet(com.google.common.collect.ImmutableSet) MessageType(parquet.schema.MessageType) Properties(java.util.Properties) ParquetPredicateUtils.predicateMatches(com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.predicateMatches) Set(java.util.Set) IOException(java.io.IOException) AggregatedMemoryContext(com.facebook.presto.hive.parquet.memory.AggregatedMemoryContext) FileMetaData(parquet.hadoop.metadata.FileMetaData) 
BlockMetaData(parquet.hadoop.metadata.BlockMetaData) HivePageSourceFactory(com.facebook.presto.hive.HivePageSourceFactory) String.format(java.lang.String.format) ConnectorSession(com.facebook.presto.spi.ConnectorSession) Objects(java.util.Objects) TupleDomain(com.facebook.presto.spi.predicate.TupleDomain) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) ConnectorPageSource(com.facebook.presto.spi.ConnectorPageSource) ParquetPredicateUtils.buildParquetPredicate(com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.buildParquetPredicate) Optional(java.util.Optional) HiveColumnHandle(com.facebook.presto.hive.HiveColumnHandle) HiveUtil.getDeserializerClassName(com.facebook.presto.hive.HiveUtil.getDeserializerClassName) HdfsParquetDataSource.buildHdfsParquetDataSource(com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource) BlockMetaData(parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(parquet.hadoop.metadata.ParquetMetadata) ArrayList(java.util.ArrayList) ParquetReader(com.facebook.presto.hive.parquet.reader.ParquetReader) PrestoException(com.facebook.presto.spi.PrestoException) IOException(java.io.IOException) AggregatedMemoryContext(com.facebook.presto.hive.parquet.memory.AggregatedMemoryContext) PrestoException(com.facebook.presto.spi.PrestoException) IOException(java.io.IOException) ParquetTypeUtils.getParquetType(com.facebook.presto.hive.parquet.ParquetTypeUtils.getParquetType) MessageType(parquet.schema.MessageType) FileSystem(org.apache.hadoop.fs.FileSystem) FileMetaData(parquet.hadoop.metadata.FileMetaData) MessageType(parquet.schema.MessageType) ParquetPredicate(com.facebook.presto.hive.parquet.predicate.ParquetPredicate) ParquetPredicateUtils.buildParquetPredicate(com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.buildParquetPredicate)

Aggregations

AggregatedMemoryContext (com.facebook.presto.hive.parquet.memory.AggregatedMemoryContext)2 ParquetReader (com.facebook.presto.hive.parquet.reader.ParquetReader)2 TypeManager (com.facebook.presto.spi.type.TypeManager)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 Path (org.apache.hadoop.fs.Path)2 HdfsEnvironment (com.facebook.presto.hive.HdfsEnvironment)1 HiveClientConfig (com.facebook.presto.hive.HiveClientConfig)1 HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle)1 REGULAR (com.facebook.presto.hive.HiveColumnHandle.ColumnType.REGULAR)1 HIVE_CANNOT_OPEN_SPLIT (com.facebook.presto.hive.HiveErrorCode.HIVE_CANNOT_OPEN_SPLIT)1 HIVE_MISSING_DATA (com.facebook.presto.hive.HiveErrorCode.HIVE_MISSING_DATA)1 HivePageSourceFactory (com.facebook.presto.hive.HivePageSourceFactory)1 HiveSessionProperties.isParquetOptimizedReaderEnabled (com.facebook.presto.hive.HiveSessionProperties.isParquetOptimizedReaderEnabled)1 HiveSessionProperties.isParquetPredicatePushdownEnabled (com.facebook.presto.hive.HiveSessionProperties.isParquetPredicatePushdownEnabled)1 HiveUtil.getDeserializerClassName (com.facebook.presto.hive.HiveUtil.getDeserializerClassName)1 HdfsParquetDataSource.buildHdfsParquetDataSource (com.facebook.presto.hive.parquet.HdfsParquetDataSource.buildHdfsParquetDataSource)1 ParquetTypeUtils.getParquetType (com.facebook.presto.hive.parquet.ParquetTypeUtils.getParquetType)1 ParquetPredicate (com.facebook.presto.hive.parquet.predicate.ParquetPredicate)1 ParquetPredicateUtils.buildParquetPredicate (com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.buildParquetPredicate)1 ParquetPredicateUtils.predicateMatches (com.facebook.presto.hive.parquet.predicate.ParquetPredicateUtils.predicateMatches)1