
Example 6 with ColumnChunk

Use of org.apache.parquet.format.ColumnChunk in project presto by prestodb.

From the class MetadataReader, the method readFooter:

public static ParquetFileMetadata readFooter(ParquetDataSource parquetDataSource, long fileSize) throws IOException {
    // Parquet File Layout:
    // 
    // MAGIC
    // variable: Data
    // variable: Metadata
    // 4 bytes: MetadataLength
    // MAGIC
    validateParquet(fileSize >= MAGIC.length() + POST_SCRIPT_SIZE, "%s is not a valid Parquet File", parquetDataSource.getId());
    // EXPECTED_FOOTER_SIZE is an int, so this will never fail
    byte[] buffer = new byte[toIntExact(min(fileSize, EXPECTED_FOOTER_SIZE))];
    parquetDataSource.readFully(fileSize - buffer.length, buffer);
    Slice tailSlice = wrappedBuffer(buffer);
    Slice magic = tailSlice.slice(tailSlice.length() - MAGIC.length(), MAGIC.length());
    if (!MAGIC.equals(magic)) {
        throw new ParquetCorruptionException(format("Not valid Parquet file: %s expected magic number: %s got: %s", parquetDataSource.getId(), Arrays.toString(MAGIC.getBytes()), Arrays.toString(magic.getBytes())));
    }
    int metadataLength = tailSlice.getInt(tailSlice.length() - POST_SCRIPT_SIZE);
    int completeFooterSize = metadataLength + POST_SCRIPT_SIZE;
    long metadataFileOffset = fileSize - completeFooterSize;
    validateParquet(metadataFileOffset >= MAGIC.length() && metadataFileOffset + POST_SCRIPT_SIZE < fileSize, "Corrupted Parquet file: %s metadata index: %s out of range", parquetDataSource.getId(), metadataFileOffset);
    // Ensure the slice covers the entire metadata range
    if (tailSlice.length() < completeFooterSize) {
        byte[] footerBuffer = new byte[completeFooterSize];
        parquetDataSource.readFully(metadataFileOffset, footerBuffer, 0, footerBuffer.length - tailSlice.length());
        // Copy the previous slice contents into the new buffer
        tailSlice.getBytes(0, footerBuffer, footerBuffer.length - tailSlice.length(), tailSlice.length());
        tailSlice = wrappedBuffer(footerBuffer, 0, footerBuffer.length);
    }
    FileMetaData fileMetaData = readFileMetaData(tailSlice.slice(tailSlice.length() - completeFooterSize, metadataLength).getInput());
    List<SchemaElement> schema = fileMetaData.getSchema();
    validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", parquetDataSource.getId());
    MessageType messageType = readParquetSchema(schema);
    List<BlockMetaData> blocks = new ArrayList<>();
    List<RowGroup> rowGroups = fileMetaData.getRow_groups();
    if (rowGroups != null) {
        for (RowGroup rowGroup : rowGroups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> columns = rowGroup.getColumns();
            validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
            String filePath = columns.get(0).getFile_path();
            for (ColumnChunk columnChunk : columns) {
                validateParquet((filePath == null && columnChunk.getFile_path() == null) || (filePath != null && filePath.equals(columnChunk.getFile_path())), "all column chunks of the same row group must be in the same file");
                ColumnMetaData metaData = columnChunk.meta_data;
                String[] path = metaData.path_in_schema.stream().map(value -> value.toLowerCase(Locale.ENGLISH)).toArray(String[]::new);
                ColumnPath columnPath = ColumnPath.get(path);
                PrimitiveType primitiveType = messageType.getType(columnPath.toArray()).asPrimitiveType();
                PrimitiveTypeName primitiveTypeName = primitiveType.getPrimitiveTypeName();
                ColumnChunkMetaData column = ColumnChunkMetaData.get(
                        columnPath,
                        primitiveType,
                        CompressionCodecName.fromParquet(metaData.codec),
                        PARQUET_METADATA_CONVERTER.convertEncodingStats(metaData.encoding_stats),
                        readEncodings(metaData.encodings),
                        readStats(metaData.statistics, primitiveTypeName),
                        metaData.data_page_offset,
                        metaData.dictionary_page_offset,
                        metaData.num_values,
                        metaData.total_compressed_size,
                        metaData.total_uncompressed_size);
                column.setColumnIndexReference(toColumnIndexReference(columnChunk));
                column.setOffsetIndexReference(toOffsetIndexReference(columnChunk));
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }
    Map<String, String> keyValueMetaData = new HashMap<>();
    List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
    if (keyValueList != null) {
        for (KeyValue keyValue : keyValueList) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    ParquetMetadata parquetMetadata = new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    return new ParquetFileMetadata(parquetMetadata, toIntExact(metadataLength));
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Arrays(java.util.Arrays) Slice(io.airlift.slice.Slice) Util.readFileMetaData(org.apache.parquet.format.Util.readFileMetaData) ConvertedType(org.apache.parquet.format.ConvertedType) Repetition(org.apache.parquet.schema.Type.Repetition) HashMap(java.util.HashMap) FileMetaData(org.apache.parquet.format.FileMetaData) ParquetMetadataConverter(org.apache.parquet.format.converter.ParquetMetadataConverter) ArrayList(java.util.ArrayList) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) HashSet(java.util.HashSet) Slices.wrappedBuffer(io.airlift.slice.Slices.wrappedBuffer) KeyValue(org.apache.parquet.format.KeyValue) Locale(java.util.Locale) SchemaElement(org.apache.parquet.format.SchemaElement) Map(java.util.Map) Type(org.apache.parquet.format.Type) IndexReference(org.apache.parquet.internal.hadoop.metadata.IndexReference) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName) Math.toIntExact(java.lang.Math.toIntExact) OriginalType(org.apache.parquet.schema.OriginalType) ParquetDataSource(com.facebook.presto.parquet.ParquetDataSource) Types(org.apache.parquet.schema.Types) Iterator(java.util.Iterator) Encoding(org.apache.parquet.format.Encoding) Set(java.util.Set) Statistics(org.apache.parquet.format.Statistics) IOException(java.io.IOException) Math.min(java.lang.Math.min) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) String.format(java.lang.String.format) ColumnChunk(org.apache.parquet.format.ColumnChunk) ColumnMetaData(org.apache.parquet.format.ColumnMetaData) US_ASCII(java.nio.charset.StandardCharsets.US_ASCII) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) RowGroup(org.apache.parquet.format.RowGroup) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) Collections(java.util.Collections) ParquetValidationUtils.validateParquet(com.facebook.presto.parquet.ParquetValidationUtils.validateParquet)
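The layout sketched in the comment at the top of readFooter (trailing MAGIC preceded by a 4-byte metadata length) can be read with nothing but the JDK. Below is a minimal sketch of that postscript arithmetic, assuming a plain RandomAccessFile; the class name FooterPostscript is illustrative and none of this comes from Presto itself.

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

public class FooterPostscript {

    private static final byte[] MAGIC = "PAR1".getBytes(StandardCharsets.US_ASCII);
    // 4-byte little-endian metadata length + 4-byte magic, as in readFooter above
    private static final int POST_SCRIPT_SIZE = 8;

    // Reads the length of the serialized footer (the Thrift FileMetaData) and validates the trailing magic.
    static int readMetadataLength(RandomAccessFile file) throws IOException {
        long fileSize = file.length();
        if (fileSize < MAGIC.length + POST_SCRIPT_SIZE) {
            throw new IOException("File is too small to be a Parquet file");
        }
        byte[] postscript = new byte[POST_SCRIPT_SIZE];
        file.seek(fileSize - POST_SCRIPT_SIZE);
        file.readFully(postscript);
        byte[] magic = Arrays.copyOfRange(postscript, 4, 8);
        if (!Arrays.equals(MAGIC, magic)) {
            throw new IOException("Not a Parquet file: bad trailing magic " + Arrays.toString(magic));
        }
        // The metadata length sits immediately before the magic, stored little-endian
        return ByteBuffer.wrap(postscript, 0, 4).order(ByteOrder.LITTLE_ENDIAN).getInt();
    }
}

The metadata itself then spans [fileSize - 8 - metadataLength, fileSize - 8), which is exactly the range readFooter re-reads when the initial speculative EXPECTED_FOOTER_SIZE read turns out to be too small.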

Example 7 with ColumnChunk

Use of org.apache.parquet.format.ColumnChunk in project parquet-mr by apache.

From the class ParquetMetadataConverter, the method addRowGroup:

private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block, InternalFileEncryptor fileEncryptor) {
    // rowGroup.total_byte_size = ;
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
    int rowGroupOrdinal = rowGroups.size();
    int columnOrdinal = -1;
    ByteArrayOutputStream tempOutStream = null;
    for (ColumnChunkMetaData columnMetaData : columns) {
        // verify this is the right offset
        ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset());
        // they are in the same file for now
        columnChunk.file_path = block.getPath();
        InternalColumnEncryptionSetup columnSetup = null;
        boolean writeCryptoMetadata = false;
        boolean encryptMetaData = false;
        ColumnPath path = columnMetaData.getPath();
        if (null != fileEncryptor) {
            columnOrdinal++;
            columnSetup = fileEncryptor.getColumnSetup(path, false, columnOrdinal);
            writeCryptoMetadata = columnSetup.isEncrypted();
            encryptMetaData = fileEncryptor.encryptColumnMetaData(columnSetup);
        }
        ColumnMetaData metaData = new ColumnMetaData(
                getType(columnMetaData.getType()),
                toFormatEncodings(columnMetaData.getEncodings()),
                Arrays.asList(columnMetaData.getPath().toArray()),
                toFormatCodec(columnMetaData.getCodec()),
                columnMetaData.getValueCount(),
                columnMetaData.getTotalUncompressedSize(),
                columnMetaData.getTotalSize(),
                columnMetaData.getFirstDataPageOffset());
        if (columnMetaData.getEncodingStats() != null && columnMetaData.getEncodingStats().hasDictionaryPages()) {
            metaData.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset());
        }
        long bloomFilterOffset = columnMetaData.getBloomFilterOffset();
        if (bloomFilterOffset >= 0) {
            metaData.setBloom_filter_offset(bloomFilterOffset);
        }
        if (columnMetaData.getStatistics() != null && !columnMetaData.getStatistics().isEmpty()) {
            metaData.setStatistics(toParquetStatistics(columnMetaData.getStatistics(), this.statisticsTruncateLength));
        }
        if (columnMetaData.getEncodingStats() != null) {
            metaData.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats()));
        }
        if (!encryptMetaData) {
            columnChunk.setMeta_data(metaData);
        } else {
            // Serialize and encrypt ColumnMetadata separately
            byte[] columnMetaDataAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.ColumnMetaData, rowGroupOrdinal, columnSetup.getOrdinal(), -1);
            if (null == tempOutStream) {
                tempOutStream = new ByteArrayOutputStream();
            } else {
                tempOutStream.reset();
            }
            try {
                writeColumnMetaData(metaData, tempOutStream, columnSetup.getMetaDataEncryptor(), columnMetaDataAAD);
            } catch (IOException e) {
                throw new ParquetCryptoRuntimeException("Failed to serialize and encrypt ColumnMetadata for " + columnMetaData.getPath(), e);
            }
            columnChunk.setEncrypted_column_metadata(tempOutStream.toByteArray());
            // Keep redacted metadata version for old readers
            if (!fileEncryptor.isFooterEncrypted()) {
                ColumnMetaData metaDataRedacted = metaData.deepCopy();
                if (metaDataRedacted.isSetStatistics())
                    metaDataRedacted.unsetStatistics();
                if (metaDataRedacted.isSetEncoding_stats())
                    metaDataRedacted.unsetEncoding_stats();
                columnChunk.setMeta_data(metaDataRedacted);
            }
        }
        if (writeCryptoMetadata) {
            columnChunk.setCrypto_metadata(columnSetup.getColumnCryptoMetaData());
        }
        // columnChunk.meta_data.index_page_offset = ;
        // columnChunk.meta_data.key_value_metadata = ; // nothing yet
        IndexReference columnIndexRef = columnMetaData.getColumnIndexReference();
        if (columnIndexRef != null) {
            columnChunk.setColumn_index_offset(columnIndexRef.getOffset());
            columnChunk.setColumn_index_length(columnIndexRef.getLength());
        }
        IndexReference offsetIndexRef = columnMetaData.getOffsetIndexReference();
        if (offsetIndexRef != null) {
            columnChunk.setOffset_index_offset(offsetIndexRef.getOffset());
            columnChunk.setOffset_index_length(offsetIndexRef.getLength());
        }
        parquetColumns.add(columnChunk);
    }
    RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
    rowGroup.setFile_offset(block.getStartingPos());
    rowGroup.setTotal_compressed_size(block.getCompressedSize());
    rowGroup.setOrdinal((short) rowGroupOrdinal);
    rowGroups.add(rowGroup);
}
Also used : ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ParquetCryptoRuntimeException(org.apache.parquet.crypto.ParquetCryptoRuntimeException) RowGroup(org.apache.parquet.format.RowGroup) ArrayList(java.util.ArrayList) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) IOException(java.io.IOException) ColumnChunk(org.apache.parquet.format.ColumnChunk) InternalColumnEncryptionSetup(org.apache.parquet.crypto.InternalColumnEncryptionSetup) Util.writeColumnMetaData(org.apache.parquet.format.Util.writeColumnMetaData) ColumnMetaData(org.apache.parquet.format.ColumnMetaData) IndexReference(org.apache.parquet.internal.hadoop.metadata.IndexReference)
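For context, the RowGroup list that addRowGroup fills is ultimately attached to the Thrift FileMetaData struct and serialized into the footer. The following is a rough sketch of that last step, assuming the usual Thrift-generated setters (setRow_groups is the same setter used in the next example) and Util.writeFileMetaData from parquet-format; the serializeFooter name and the bare-method shape are illustrative only, not parquet-mr's actual footer writer.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;

import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.RowGroup;
import org.apache.parquet.format.SchemaElement;
import org.apache.parquet.format.Util;

// Illustrative only: attach the row groups built above to a Thrift FileMetaData and serialize it.
static byte[] serializeFooter(List<SchemaElement> schema, long numRows, List<RowGroup> rowGroups) throws IOException {
    FileMetaData footer = new FileMetaData();
    // format version of the file, not the parquet-mr version
    footer.setVersion(1);
    footer.setSchema(schema);
    footer.setNum_rows(numRows);
    footer.setRow_groups(rowGroups);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    // Thrift-serializes the struct; on disk this is followed by the 4-byte length and the PAR1 magic
    Util.writeFileMetaData(footer, out);
    return out.toByteArray();
}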

Example 8 with ColumnChunk

Use of org.apache.parquet.format.ColumnChunk in project parquet-mr by apache.

From the class ParquetMetadataConverter, the method filterFileMetaDataByMidpoint:

// Visible for testing
static FileMetaData filterFileMetaDataByMidpoint(FileMetaData metaData, RangeMetadataFilter filter) {
    List<RowGroup> rowGroups = metaData.getRow_groups();
    List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
    long preStartIndex = 0;
    long preCompressedSize = 0;
    boolean firstColumnWithMetadata = true;
    if (rowGroups != null && rowGroups.size() > 0) {
        firstColumnWithMetadata = rowGroups.get(0).getColumns().get(0).isSetMeta_data();
    }
    for (RowGroup rowGroup : rowGroups) {
        long totalSize = 0;
        long startIndex;
        ColumnChunk columnChunk = rowGroup.getColumns().get(0);
        if (firstColumnWithMetadata) {
            startIndex = getOffset(columnChunk);
        } else {
            assert rowGroup.isSetFile_offset();
            assert rowGroup.isSetTotal_compressed_size();
            // the file_offset of first block always holds the truth, while other blocks don't :
            // see PARQUET-2078 for details
            startIndex = rowGroup.getFile_offset();
            if (invalidFileOffset(startIndex, preStartIndex, preCompressedSize)) {
                // first row group's offset is always 4
                if (preStartIndex == 0) {
                    startIndex = 4;
                } else {
                    // use minStartIndex(imprecise in case of padding, but good enough for filtering)
                    startIndex = preStartIndex + preCompressedSize;
                }
            }
            preStartIndex = startIndex;
            preCompressedSize = rowGroup.getTotal_compressed_size();
        }
        if (rowGroup.isSetTotal_compressed_size()) {
            totalSize = rowGroup.getTotal_compressed_size();
        } else {
            for (ColumnChunk col : rowGroup.getColumns()) {
                totalSize += col.getMeta_data().getTotal_compressed_size();
            }
        }
        long midPoint = startIndex + totalSize / 2;
        if (filter.contains(midPoint)) {
            newRowGroups.add(rowGroup);
        }
    }
    metaData.setRow_groups(newRowGroups);
    return metaData;
}
Also used : RowGroup(org.apache.parquet.format.RowGroup) ArrayList(java.util.ArrayList) ColumnChunk(org.apache.parquet.format.ColumnChunk)
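The rule applied by filter.contains is simple: a row group is kept when the midpoint of its byte range, startIndex + totalSize / 2, falls inside the requested half-open window [from, from + length). A minimal illustration of that check (midpointInRange is a hypothetical helper, not part of ParquetMetadataConverter):

// Hypothetical helper mirroring the midpoint rule used by filterFileMetaDataByMidpoint.
static boolean midpointInRange(long from, long length, long startIndex, long totalSize) {
    long midPoint = startIndex + totalSize / 2;
    return midPoint >= from && midPoint < from + length;
}

For a split covering [0, 64 MB) and a row group spanning 32 MB starting at 48 MB, the midpoint is 64 MB, which lies outside the half-open range, so that row group is assigned to the next split; because each midpoint lands in exactly one split, every row group is read exactly once.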

Aggregations

ArrayList (java.util.ArrayList): 8
ColumnChunk (org.apache.parquet.format.ColumnChunk): 8
RowGroup (org.apache.parquet.format.RowGroup): 8
ColumnMetaData (org.apache.parquet.format.ColumnMetaData): 6
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 5
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 4
HashMap (java.util.HashMap): 3
KeyValue (org.apache.parquet.format.KeyValue): 3
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 3
IOException (java.io.IOException): 2
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2
ParquetCryptoRuntimeException (org.apache.parquet.crypto.ParquetCryptoRuntimeException): 2
FileMetaData (org.apache.parquet.format.FileMetaData): 2
SchemaElement (org.apache.parquet.format.SchemaElement): 2
Util.writeColumnMetaData (org.apache.parquet.format.Util.writeColumnMetaData): 2
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 2
IndexReference (org.apache.parquet.internal.hadoop.metadata.IndexReference): 2
ParquetDecodingException (org.apache.parquet.io.ParquetDecodingException): 2
MessageType (org.apache.parquet.schema.MessageType): 2
ParquetCorruptionException (com.facebook.presto.parquet.ParquetCorruptionException): 1