
Example 1 with ColumnChunkMetaData

Use of parquet.hadoop.metadata.ColumnChunkMetaData in project presto by prestodb.

From the class ParquetMetadataReader, the method readFooter:

public static ParquetMetadata readFooter(FileSystem fileSystem, Path file) throws IOException {
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC
        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;
        inputStream.seek(metadataLengthIndex);
        // the footer length is stored as a 4-byte little-endian integer just before the trailing magic
        int metadataLength = readIntLittleEndian(inputStream);
        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));
        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex, "Corrupted Parquet file: %s metadata index: %s out of range", file, metadataIndex);
        inputStream.seek(metadataIndex);
        // deserialize the Thrift-encoded footer: schema, row groups, and key/value metadata
        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);
        MessageType messageType = readParquetSchema(schema);
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet(
                            (filePath == null && columnChunk.getFile_path() == null) || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }
        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}
Also used : BlockMetaData(parquet.hadoop.metadata.BlockMetaData), FileStatus(org.apache.hadoop.fs.FileStatus), KeyValue(parquet.format.KeyValue), ColumnChunkMetaData(parquet.hadoop.metadata.ColumnChunkMetaData), HashMap(java.util.HashMap), ParquetMetadata(parquet.hadoop.metadata.ParquetMetadata), RowGroup(parquet.format.RowGroup), ArrayList(java.util.ArrayList), ColumnChunk(parquet.format.ColumnChunk), SchemaElement(parquet.format.SchemaElement), ColumnMetaData(parquet.format.ColumnMetaData), FileMetaData(parquet.format.FileMetaData), Util.readFileMetaData(parquet.format.Util.readFileMetaData), MessageType(parquet.schema.MessageType), ColumnPath(parquet.hadoop.metadata.ColumnPath), FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream)
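
A rough caller sketch (not from the Presto sources) for the footer reader above; the Configuration, the file path, and the printRowGroups wrapper are hypothetical, while the remaining classes are the same ones listed under "Also used":

public static void printRowGroups() throws IOException {
    Configuration configuration = new Configuration();
    // hypothetical input file; any path reachable through the Hadoop FileSystem API works
    Path file = new Path("hdfs:///tmp/example.parquet");
    FileSystem fileSystem = file.getFileSystem(configuration);
    ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, file);
    for (BlockMetaData block : parquetMetadata.getBlocks()) {
        // each BlockMetaData corresponds to one row group in the file
        System.out.printf("row group: %d rows, %d bytes%n", block.getRowCount(), block.getTotalByteSize());
    }
}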

Example 2 with ColumnChunkMetaData

Use of parquet.hadoop.metadata.ColumnChunkMetaData in project presto by prestodb.

From the class ParquetReader, the method readPrimitive:

private Block readPrimitive(ColumnDescriptor columnDescriptor, Type type, IntList offsets) throws IOException {
    ParquetColumnReader columnReader = columnReadersMap.get(columnDescriptor);
    // lazily read the whole column chunk into memory the first time this column is touched
    if (columnReader.getPageReader() == null) {
        validateParquet(currentBlockMetadata.getRowCount() > 0, "Row group has 0 rows");
        ColumnChunkMetaData metadata = getColumnChunkMetaData(columnDescriptor);
        long startingPosition = metadata.getStartingPos();
        int totalSize = toIntExact(metadata.getTotalSize());
        byte[] buffer = allocateBlock(totalSize);
        // read the entire compressed column chunk (dictionary page plus data pages) in one call
        dataSource.readFully(startingPosition, buffer);
        ParquetColumnChunkDescriptor descriptor = new ParquetColumnChunkDescriptor(columnDescriptor, metadata, totalSize);
        ParquetColumnChunk columnChunk = new ParquetColumnChunk(descriptor, buffer, 0);
        columnReader.setPageReader(columnChunk.readAllPages());
    }
    return columnReader.readPrimitive(type, offsets);
}
Also used : ColumnChunkMetaData(parquet.hadoop.metadata.ColumnChunkMetaData)
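
The getColumnChunkMetaData helper used above is not shown on this page. A minimal sketch of what such a lookup could look like, assuming the chunk is matched to the requested column by its path within currentBlockMetadata; the real Presto helper may differ (for example by throwing a Presto-specific corruption exception):

private ColumnChunkMetaData getColumnChunkMetaData(ColumnDescriptor columnDescriptor) throws IOException {
    for (ColumnChunkMetaData metadata : currentBlockMetadata.getColumns()) {
        // match the physical chunk to the requested column by its dotted path in the schema
        if (metadata.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))) {
            return metadata;
        }
    }
    throw new IOException("Metadata is missing for column: " + Arrays.toString(columnDescriptor.getPath()));
}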

Example 3 with ColumnChunkMetaData

Use of parquet.hadoop.metadata.ColumnChunkMetaData in project presto by prestodb.

From the class ParquetPredicateUtils, the method getDictionariesByColumnOrdinal:

private static Map<Integer, ParquetDictionaryDescriptor> getDictionariesByColumnOrdinal(BlockMetaData blockMetadata, ParquetDataSource dataSource, MessageType requestedSchema, TupleDomain<HiveColumnHandle> effectivePredicate) {
    ImmutableMap.Builder<Integer, ParquetDictionaryDescriptor> dictionaries = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < blockMetadata.getColumns().size(); ordinal++) {
        ColumnChunkMetaData columnChunkMetaData = blockMetadata.getColumns().get(ordinal);
        for (int i = 0; i < requestedSchema.getColumns().size(); i++) {
            ColumnDescriptor columnDescriptor = requestedSchema.getColumns().get(i);
            // consider a chunk only when a predicate exists for the column, the chunk's path matches
            // the requested column, and every data page in the chunk is dictionary-encoded
            if (isColumnPredicate(columnDescriptor, effectivePredicate)
                    && columnChunkMetaData.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))
                    && isOnlyDictionaryEncodingPages(columnChunkMetaData.getEncodings())) {
                try {
                    int totalSize = toIntExact(columnChunkMetaData.getTotalSize());
                    byte[] buffer = new byte[totalSize];
                    dataSource.readFully(columnChunkMetaData.getStartingPos(), buffer);
                    Optional<ParquetDictionaryPage> dictionaryPage = readDictionaryPage(buffer, columnChunkMetaData.getCodec());
                    dictionaries.put(ordinal, new ParquetDictionaryDescriptor(columnDescriptor, dictionaryPage));
                } catch (IOException ignored) {
                    // a failed dictionary read is not fatal; predicate pruning simply skips this column
                }
                break;
            }
        }
    }
    return dictionaries.build();
}
Also used : ParquetDictionaryPage(com.facebook.presto.hive.parquet.ParquetDictionaryPage), ColumnChunkMetaData(parquet.hadoop.metadata.ColumnChunkMetaData), ColumnDescriptor(parquet.column.ColumnDescriptor), IOException(java.io.IOException), ImmutableMap(com.google.common.collect.ImmutableMap)
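
The isOnlyDictionaryEncodingPages check referenced above is also not shown here. A plausible sketch, assuming a chunk qualifies only when PLAIN_DICTIONARY appears and the only other encodings are the RLE/BIT_PACKED level encodings (Encoding is parquet.column.Encoding; the actual Presto check may differ):

private static boolean isOnlyDictionaryEncodingPages(Set<Encoding> encodings) {
    // without a dictionary-encoded data page there is nothing to prune on
    if (!encodings.contains(Encoding.PLAIN_DICTIONARY)) {
        return false;
    }
    // RLE and BIT_PACKED encode repetition/definition levels, not values, so ignore them
    Set<Encoding> remaining = new HashSet<>(encodings);
    remaining.remove(Encoding.PLAIN_DICTIONARY);
    remaining.remove(Encoding.RLE);
    remaining.remove(Encoding.BIT_PACKED);
    return remaining.isEmpty();
}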

Aggregations

ColumnChunkMetaData (parquet.hadoop.metadata.ColumnChunkMetaData): 3 usages
ParquetDictionaryPage (com.facebook.presto.hive.parquet.ParquetDictionaryPage): 1 usage
ImmutableMap (com.google.common.collect.ImmutableMap): 1 usage
IOException (java.io.IOException): 1 usage
ArrayList (java.util.ArrayList): 1 usage
HashMap (java.util.HashMap): 1 usage
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 1 usage
FileStatus (org.apache.hadoop.fs.FileStatus): 1 usage
ColumnDescriptor (parquet.column.ColumnDescriptor): 1 usage
ColumnChunk (parquet.format.ColumnChunk): 1 usage
ColumnMetaData (parquet.format.ColumnMetaData): 1 usage
FileMetaData (parquet.format.FileMetaData): 1 usage
KeyValue (parquet.format.KeyValue): 1 usage
RowGroup (parquet.format.RowGroup): 1 usage
SchemaElement (parquet.format.SchemaElement): 1 usage
Util.readFileMetaData (parquet.format.Util.readFileMetaData): 1 usage
BlockMetaData (parquet.hadoop.metadata.BlockMetaData): 1 usage
ColumnPath (parquet.hadoop.metadata.ColumnPath): 1 usage
ParquetMetadata (parquet.hadoop.metadata.ParquetMetadata): 1 usage
MessageType (parquet.schema.MessageType): 1 usage