Example 1 with SchemaElement

Use of parquet.format.SchemaElement in project presto by prestodb.

From the class ParquetMetadataReader, the method readTypeSchema:

private static void readTypeSchema(Types.GroupBuilder<?> builder, Iterator<SchemaElement> schemaIterator, int typeCount) {
    for (int i = 0; i < typeCount; i++) {
        SchemaElement element = schemaIterator.next();
        Types.Builder<?, ?> typeBuilder;
        // a SchemaElement with no physical type is a group; its num_children
        // elements follow it directly in the flattened schema list
        if (element.type == null) {
            typeBuilder = builder.group(Repetition.valueOf(element.repetition_type.name()));
            readTypeSchema((Types.GroupBuilder<?>) typeBuilder, schemaIterator, element.num_children);
        } else {
            Types.PrimitiveBuilder<?> primitiveBuilder = builder.primitive(getTypeName(element.type), Repetition.valueOf(element.repetition_type.name()));
            if (element.isSetType_length()) {
                primitiveBuilder.length(element.type_length);
            }
            if (element.isSetPrecision()) {
                primitiveBuilder.precision(element.precision);
            }
            if (element.isSetScale()) {
                primitiveBuilder.scale(element.scale);
            }
            typeBuilder = primitiveBuilder;
        }
        // converted_type carries the logical type annotation (e.g. UTF8, DECIMAL)
        if (element.isSetConverted_type()) {
            typeBuilder.as(getOriginalType(element.converted_type));
        }
        if (element.isSetField_id()) {
            typeBuilder.id(element.field_id);
        }
        typeBuilder.named(element.name);
    }
}
Also used: Types (parquet.schema.Types), SchemaElement (parquet.format.SchemaElement)
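The flattened list that readTypeSchema consumes encodes the schema tree in depth-first order: a group element has no physical type, carries num_children, and its children follow it immediately. Below is a minimal sketch (not from the Presto source) of such a list for a hypothetical two-field message; the Thrift-generated setters are assumed to chain, as Thrift's Java code generator produces.

import java.util.Arrays;
import java.util.List;
import parquet.format.FieldRepetitionType;
import parquet.format.SchemaElement;
import parquet.format.Type;

// Flattened form of the hypothetical message:
//   message example {
//       required int32 id;
//       optional group tags {
//           repeated binary tag;
//       }
//   }
private static List<SchemaElement> exampleFlatSchema() {
    return Arrays.asList(
            // root element: no physical type, two children follow
            new SchemaElement("example").setNum_children(2),
            new SchemaElement("id").setType(Type.INT32).setRepetition_type(FieldRepetitionType.REQUIRED),
            // group element: no physical type, one child follows
            new SchemaElement("tags").setRepetition_type(FieldRepetitionType.OPTIONAL).setNum_children(1),
            new SchemaElement("tag").setType(Type.BYTE_ARRAY).setRepetition_type(FieldRepetitionType.REPEATED));
}

Passing this list to readParquetSchema (Example 3) rebuilds the nested message type.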

Example 2 with SchemaElement

Use of parquet.format.SchemaElement in project presto by prestodb.

From the class ParquetMetadataReader, the method readFooter:

public static ParquetMetadata readFooter(FileSystem fileSystem, Path file) throws IOException {
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC
        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
        // seek to the 4-byte metadata length, which sits just before the trailing magic
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;
        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);
        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));
        // the Thrift-encoded metadata block ends where its length field begins
        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex, "Corrupted Parquet file: %s metadata index: %s out of range", file, metadataIndex);
        inputStream.seek(metadataIndex);
        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);
        MessageType messageType = readParquetSchema(schema);
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet(
                            (filePath == null && columnChunk.getFile_path() == null) || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }
        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}
Also used: BlockMetaData (parquet.hadoop.metadata.BlockMetaData), FileStatus (org.apache.hadoop.fs.FileStatus), KeyValue (parquet.format.KeyValue), ColumnChunkMetaData (parquet.hadoop.metadata.ColumnChunkMetaData), HashMap (java.util.HashMap), ParquetMetadata (parquet.hadoop.metadata.ParquetMetadata), RowGroup (parquet.format.RowGroup), ArrayList (java.util.ArrayList), ColumnChunk (parquet.format.ColumnChunk), SchemaElement (parquet.format.SchemaElement), ColumnMetaData (parquet.format.ColumnMetaData), FileMetaData (parquet.format.FileMetaData), Util.readFileMetaData (parquet.format.Util.readFileMetaData), MessageType (parquet.schema.MessageType), ColumnPath (parquet.hadoop.metadata.ColumnPath), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)
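A minimal usage sketch (not from the Presto source) showing how readFooter might be called; the file path and Hadoop Configuration are hypothetical. Note that readFooter performs two seeks: first to the fixed-size tail (length field plus magic), then back to the variable-size metadata block, so it never touches the data pages.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public static void main(String[] args) throws Exception {
    // hypothetical input file
    Path file = new Path("/tmp/example.parquet");
    FileSystem fileSystem = file.getFileSystem(new Configuration());
    ParquetMetadata metadata = ParquetMetadataReader.readFooter(fileSystem, file);
    // print the reconstructed schema and per-row-group statistics
    System.out.println(metadata.getFileMetaData().getSchema());
    for (BlockMetaData block : metadata.getBlocks()) {
        System.out.printf("row group: %s rows, %s bytes%n", block.getRowCount(), block.getTotalByteSize());
    }
}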

Example 3 with SchemaElement

Use of parquet.format.SchemaElement in project presto by prestodb.

From the class ParquetMetadataReader, the method readParquetSchema:

private static MessageType readParquetSchema(List<SchemaElement> schema) {
    Iterator<SchemaElement> schemaIterator = schema.iterator();
    // the first element is the schema root; the remaining elements are its descendants in depth-first order
    SchemaElement rootSchema = schemaIterator.next();
    Types.MessageTypeBuilder builder = Types.buildMessage();
    readTypeSchema(builder, schemaIterator, rootSchema.getNum_children());
    return builder.named(rootSchema.name);
}
Also used: Types (parquet.schema.Types), SchemaElement (parquet.format.SchemaElement)
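For comparison, a sketch (not from the Presto source) of the hypothetical schema from Example 1 built directly against the parquet.schema.Types fluent API, which is the same builder that readTypeSchema drives element by element:

import parquet.schema.MessageType;
import parquet.schema.Types;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;

private static MessageType exampleMessageType() {
    return Types.buildMessage()
            .required(INT32).named("id")
            .optionalGroup()
                .repeated(BINARY).named("tag")
                .named("tags")
            .named("example");
}

Note the symmetry with readParquetSchema: calling named(rootSchema.name) on the message builder is what finally materializes the MessageType.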

Aggregations

SchemaElement (parquet.format.SchemaElement): 3 usages
Types (parquet.schema.Types): 2 usages
ArrayList (java.util.ArrayList): 1 usage
HashMap (java.util.HashMap): 1 usage
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 1 usage
FileStatus (org.apache.hadoop.fs.FileStatus): 1 usage
ColumnChunk (parquet.format.ColumnChunk): 1 usage
ColumnMetaData (parquet.format.ColumnMetaData): 1 usage
FileMetaData (parquet.format.FileMetaData): 1 usage
KeyValue (parquet.format.KeyValue): 1 usage
RowGroup (parquet.format.RowGroup): 1 usage
Util.readFileMetaData (parquet.format.Util.readFileMetaData): 1 usage
BlockMetaData (parquet.hadoop.metadata.BlockMetaData): 1 usage
ColumnChunkMetaData (parquet.hadoop.metadata.ColumnChunkMetaData): 1 usage
ColumnPath (parquet.hadoop.metadata.ColumnPath): 1 usage
ParquetMetadata (parquet.hadoop.metadata.ParquetMetadata): 1 usage
MessageType (parquet.schema.MessageType): 1 usage