Search in sources :

Example 1 with ColumnChunk

Use of parquet.format.ColumnChunk in the presto project by prestodb.

Class ParquetMetadataReader, method readFooter.

/**
 * Reads the footer of a Parquet file and converts it into a {@link ParquetMetadata}.
 *
 * <p>The footer is located from the end of the file: the metadata length (read as a
 * little-endian int) and the trailing magic bytes are read first, then the stream is
 * positioned at the start of the serialized file metadata.
 *
 * @param fileSystem file system used to stat and open {@code file}
 * @param file path of the Parquet file to read
 * @return the schema, key/value metadata, creator string and one block per row group
 * @throws IOException if reading the file fails (failed {@code validateParquet} checks are
 *         presumably reported the same way; confirm against {@code validateParquet})
 */
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file) throws IOException {
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC
        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);

        // The metadata length sits immediately before the trailing magic bytes.
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;
        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);

        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));

        // The metadata must start after the leading magic bytes and before its own length field;
        // this also rejects zero or negative metadata lengths.
        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex, "Corrupted Parquet file: %s metadata index: %s out of range", file, metadataIndex);
        inputStream.seek(metadataIndex);

        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);
        MessageType messageType = readParquetSchema(schema);

        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());

                List<ColumnChunk> columns = rowGroup.getColumns();
                // Guard against a missing list as well as an empty one so a malformed footer
                // is reported through validateParquet instead of a NullPointerException.
                validateParquet(columns != null && !columns.isEmpty(), "No columns in row group: %s", rowGroup);

                // Every chunk in the row group must share the file path of the first chunk.
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet((filePath == null && columnChunk.getFile_path() == null) || (filePath != null && filePath.equals(columnChunk.getFile_path())), "all column chunks of the same row group must be in the same file");

                    ColumnMetaData metaData = columnChunk.meta_data;
                    // meta_data is an optional field of ColumnChunk in the Parquet Thrift definition;
                    // fail with a descriptive error rather than dereferencing null below.
                    validateParquet(metaData != null, "Missing column metadata in row group: %s", rowGroup);

                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }

        // Key/value metadata may be absent; an empty map is returned in that case.
        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}
Also used : BlockMetaData(parquet.hadoop.metadata.BlockMetaData) FileStatus(org.apache.hadoop.fs.FileStatus) KeyValue(parquet.format.KeyValue) ColumnChunkMetaData(parquet.hadoop.metadata.ColumnChunkMetaData) HashMap(java.util.HashMap) ParquetMetadata(parquet.hadoop.metadata.ParquetMetadata) RowGroup(parquet.format.RowGroup) ArrayList(java.util.ArrayList) ColumnChunk(parquet.format.ColumnChunk) SchemaElement(parquet.format.SchemaElement) ColumnMetaData(parquet.format.ColumnMetaData) FileMetaData(parquet.format.FileMetaData) Util.readFileMetaData(parquet.format.Util.readFileMetaData) MessageType(parquet.schema.MessageType) ColumnPath(parquet.hadoop.metadata.ColumnPath) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream)

Aggregations

ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 ColumnChunk (parquet.format.ColumnChunk)1 ColumnMetaData (parquet.format.ColumnMetaData)1 FileMetaData (parquet.format.FileMetaData)1 KeyValue (parquet.format.KeyValue)1 RowGroup (parquet.format.RowGroup)1 SchemaElement (parquet.format.SchemaElement)1 Util.readFileMetaData (parquet.format.Util.readFileMetaData)1 BlockMetaData (parquet.hadoop.metadata.BlockMetaData)1 ColumnChunkMetaData (parquet.hadoop.metadata.ColumnChunkMetaData)1 ColumnPath (parquet.hadoop.metadata.ColumnPath)1 ParquetMetadata (parquet.hadoop.metadata.ParquetMetadata)1 MessageType (parquet.schema.MessageType)1