Search in sources:

Example 1 with ColumnChunk

Use of org.apache.parquet.format.ColumnChunk in the project parquet-mr by apache.

From the class ParquetMetadataConverter, method fromParquetMetadata.

public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata, InternalFileDecryptor fileDecryptor, boolean encryptedFooter) throws IOException {
    MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    List<RowGroup> row_groups = parquetMetadata.getRow_groups();
    if (row_groups != null) {
        for (RowGroup rowGroup : row_groups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            // not set in legacy files
            if (rowGroup.isSetOrdinal()) {
                blockMetaData.setOrdinal(rowGroup.getOrdinal());
            }
            List<ColumnChunk> columns = rowGroup.getColumns();
            String filePath = columns.get(0).getFile_path();
            int columnOrdinal = -1;
            for (ColumnChunk columnChunk : columns) {
                columnOrdinal++;
                if ((filePath == null && columnChunk.getFile_path() != null) || (filePath != null && !filePath.equals(columnChunk.getFile_path()))) {
                    throw new ParquetDecodingException("all column chunks of the same row group must be in the same file for now");
                }
                ColumnMetaData metaData = columnChunk.meta_data;
                ColumnCryptoMetaData cryptoMetaData = columnChunk.getCrypto_metadata();
                ColumnChunkMetaData column = null;
                ColumnPath columnPath = null;
                boolean encryptedMetadata = false;
                if (null == cryptoMetaData) {
                    // Plaintext column
                    columnPath = getPath(metaData);
                    if (null != fileDecryptor && !fileDecryptor.plaintextFile()) {
                        // mark this column as plaintext in encrypted file decryptor
                        fileDecryptor.setColumnCryptoMetadata(columnPath, false, false, (byte[]) null, columnOrdinal);
                    }
                } else {
                    // Encrypted column
                    boolean encryptedWithFooterKey = cryptoMetaData.isSetENCRYPTION_WITH_FOOTER_KEY();
                    if (encryptedWithFooterKey) {
                        // Column encrypted with footer key
                        if (!encryptedFooter) {
                            throw new ParquetCryptoRuntimeException("Column encrypted with footer key in file with plaintext footer");
                        }
                        if (null == metaData) {
                            throw new ParquetCryptoRuntimeException("ColumnMetaData not set in Encryption with Footer key");
                        }
                        if (null == fileDecryptor) {
                            throw new ParquetCryptoRuntimeException("Column encrypted with footer key: No keys available");
                        }
                        columnPath = getPath(metaData);
                        fileDecryptor.setColumnCryptoMetadata(columnPath, true, true, (byte[]) null, columnOrdinal);
                    } else {
                        // Column encrypted with column key
                        // setColumnCryptoMetadata triggers KMS interaction, hence delayed until this column is projected
                        encryptedMetadata = true;
                    }
                }
                String createdBy = parquetMetadata.getCreated_by();
                if (!encryptedMetadata) {
                    // unencrypted column, or encrypted with footer key
                    column = buildColumnChunkMetaData(metaData, columnPath, messageType.getType(columnPath.toArray()).asPrimitiveType(), createdBy);
                    column.setRowGroupOrdinal(rowGroup.getOrdinal());
                    if (metaData.isSetBloom_filter_offset()) {
                        column.setBloomFilterOffset(metaData.getBloom_filter_offset());
                    }
                } else {
                    // column encrypted with column key
                    // Metadata will be decrypted later, if this column is accessed
                    EncryptionWithColumnKey columnKeyStruct = cryptoMetaData.getENCRYPTION_WITH_COLUMN_KEY();
                    List<String> pathList = columnKeyStruct.getPath_in_schema();
                    byte[] columnKeyMetadata = columnKeyStruct.getKey_metadata();
                    columnPath = ColumnPath.get(pathList.toArray(new String[pathList.size()]));
                    byte[] encryptedMetadataBuffer = columnChunk.getEncrypted_column_metadata();
                    column = ColumnChunkMetaData.getWithEncryptedMetadata(
                        this,
                        columnPath,
                        messageType.getType(columnPath.toArray()).asPrimitiveType(),
                        encryptedMetadataBuffer,
                        columnKeyMetadata,
                        fileDecryptor,
                        rowGroup.getOrdinal(),
                        columnOrdinal,
                        createdBy);
                }
                column.setColumnIndexReference(toColumnIndexReference(columnChunk));
                column.setOffsetIndexReference(toOffsetIndexReference(columnChunk));
                // TODO
                // index_page_offset
                // key_value_metadata
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }
    Map<String, String> keyValueMetaData = new HashMap<String, String>();
    List<KeyValue> key_value_metadata = parquetMetadata.getKey_value_metadata();
    if (key_value_metadata != null) {
        for (KeyValue keyValue : key_value_metadata) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    return new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, parquetMetadata.getCreated_by(), fileDecryptor), blocks);
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ParquetDecodingException (org.apache.parquet.io.ParquetDecodingException), KeyValue (org.apache.parquet.format.KeyValue), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), ParquetCryptoRuntimeException (org.apache.parquet.crypto.ParquetCryptoRuntimeException), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), HashMap (java.util.HashMap), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), RowGroup (org.apache.parquet.format.RowGroup), ArrayList (java.util.ArrayList), ColumnChunk (org.apache.parquet.format.ColumnChunk), Util.writeColumnMetaData (org.apache.parquet.format.Util.writeColumnMetaData), ColumnMetaData (org.apache.parquet.format.ColumnMetaData), MessageType (org.apache.parquet.schema.MessageType), ColumnCryptoMetaData (org.apache.parquet.format.ColumnCryptoMetaData), EncryptionWithColumnKey (org.apache.parquet.format.EncryptionWithColumnKey), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath)
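
This conversion starts from the serialized thrift footer of a file. A minimal sketch of the call, assuming the footer bytes are already in memory and the file is plaintext (the class and variable names here are hypothetical, not from the source):

import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.Util;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterConversionSketch {

    // footerBytes is assumed to hold the serialized thrift footer,
    // i.e. the bytes preceding the trailing footer length and "PAR1" magic.
    public static ParquetMetadata convert(byte[] footerBytes) throws IOException {
        // Deserialize the thrift FileMetaData with parquet-format's Util
        FileMetaData thriftFooter = Util.readFileMetaData(new ByteArrayInputStream(footerBytes));
        // A null decryptor with encryptedFooter=false takes the plaintext branches above
        return new ParquetMetadataConverter().fromParquetMetadata(thriftFooter, null, false);
    }
}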

Example 2 with ColumnChunk

Use of org.apache.parquet.format.ColumnChunk in the project parquet-mr by apache.

From the class ParquetMetadataConverter, method filterFileMetaDataByStart.

// Visible for testing
static FileMetaData filterFileMetaDataByStart(FileMetaData metaData, OffsetMetadataFilter filter) {
    List<RowGroup> rowGroups = metaData.getRow_groups();
    List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
    long preStartIndex = 0;
    long preCompressedSize = 0;
    boolean firstColumnWithMetadata = true;
    if (rowGroups != null && rowGroups.size() > 0) {
        firstColumnWithMetadata = rowGroups.get(0).getColumns().get(0).isSetMeta_data();
    }
    for (RowGroup rowGroup : rowGroups) {
        long startIndex;
        ColumnChunk columnChunk = rowGroup.getColumns().get(0);
        if (firstColumnWithMetadata) {
            startIndex = getOffset(columnChunk);
        } else {
            assert rowGroup.isSetFile_offset();
            assert rowGroup.isSetTotal_compressed_size();
            // the file_offset of the first block always holds the truth, while
            // other blocks' may not; see PARQUET-2078 for details
            startIndex = rowGroup.getFile_offset();
            if (invalidFileOffset(startIndex, preStartIndex, preCompressedSize)) {
                // first row group's offset is always 4
                if (preStartIndex == 0) {
                    startIndex = 4;
                } else {
                    throw new InvalidFileOffsetException("corrupted RowGroup.file_offset found, " + "please use file range instead of block offset for split.");
                }
            }
            preStartIndex = startIndex;
            preCompressedSize = rowGroup.getTotal_compressed_size();
        }
        if (filter.contains(startIndex)) {
            newRowGroups.add(rowGroup);
        }
    }
    metaData.setRow_groups(newRowGroups);
    return metaData;
}
Also used: InvalidFileOffsetException (org.apache.parquet.io.InvalidFileOffsetException), RowGroup (org.apache.parquet.format.RowGroup), ArrayList (java.util.ArrayList), ColumnChunk (org.apache.parquet.format.ColumnChunk)
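
The OffsetMetadataFilter consulted here is built through the public offsets(...) factory on ParquetMetadataConverter; filter.contains(startIndex) keeps exactly the row groups that begin at one of the requested offsets. A minimal sketch of the public entry point, assuming a Hadoop path (names are hypothetical and builder details may vary across parquet-mr versions):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class OffsetFilterSketch {

    // Reads the footer keeping only the row groups that start at one of rowGroupOffsets
    public static ParquetMetadata readFiltered(Configuration conf, Path file, long... rowGroupOffsets) throws IOException {
        ParquetReadOptions options = ParquetReadOptions.builder()
            .withMetadataFilter(ParquetMetadataConverter.offsets(rowGroupOffsets))
            .build();
        try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf), options)) {
            return reader.getFooter();
        }
    }
}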

Example 3 with ColumnChunk

Use of org.apache.parquet.format.ColumnChunk in the project parquet-mr by apache.

From the class TestParquetMetadataConverter, method metadata.

private FileMetaData metadata(long... sizes) {
    List<SchemaElement> schema = emptyList();
    List<RowGroup> rowGroups = new ArrayList<RowGroup>();
    long offset = 0;
    for (long size : sizes) {
        ColumnChunk columnChunk = new ColumnChunk(offset);
        columnChunk.setMeta_data(new ColumnMetaData(
            INT32,
            Collections.<org.apache.parquet.format.Encoding>emptyList(),
            Collections.<String>emptyList(),
            UNCOMPRESSED,
            10L,
            size * 2,
            size,
            offset));
        rowGroups.add(new RowGroup(Arrays.asList(columnChunk), size, 1));
        offset += size;
    }
    return new FileMetaData(1, schema, sizes.length, rowGroups);
}
Also used: RowGroup (org.apache.parquet.format.RowGroup), ArrayList (java.util.ArrayList), Encoding (org.apache.parquet.column.Encoding), ColumnChunk (org.apache.parquet.format.ColumnChunk), SchemaElement (org.apache.parquet.format.SchemaElement), ColumnMetaData (org.apache.parquet.format.ColumnMetaData), FileMetaData (org.apache.parquet.format.FileMetaData)
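
Since the chunks are laid out back to back, each data_page_offset is the running sum of the preceding sizes. A hypothetical test sketch built on this helper (it assumes it sits inside TestParquetMetadataConverter, where the private metadata(...) method is accessible):

@Test
public void testOffsetsAreCumulative() {
    // Three row groups of sizes 10, 20 and 30 start at offsets 0, 10 and 30
    FileMetaData md = metadata(10, 20, 30);
    List<RowGroup> rowGroups = md.getRow_groups();
    assertEquals(0, rowGroups.get(0).getColumns().get(0).getMeta_data().getData_page_offset());
    assertEquals(10, rowGroups.get(1).getColumns().get(0).getMeta_data().getData_page_offset());
    assertEquals(30, rowGroups.get(2).getColumns().get(0).getMeta_data().getData_page_offset());
}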

Example 4 with ColumnChunk

Use of org.apache.parquet.format.ColumnChunk in the project parquet-mr by apache.

From the class ParquetMetadataConverter, method fromParquetMetadata.

public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException {
    MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    List<RowGroup> row_groups = parquetMetadata.getRow_groups();
    if (row_groups != null) {
        for (RowGroup rowGroup : row_groups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> columns = rowGroup.getColumns();
            String filePath = columns.get(0).getFile_path();
            for (ColumnChunk columnChunk : columns) {
                if ((filePath == null && columnChunk.getFile_path() != null) || (filePath != null && !filePath.equals(columnChunk.getFile_path()))) {
                    throw new ParquetDecodingException("all column chunks of the same row group must be in the same file for now");
                }
                ColumnMetaData metaData = columnChunk.meta_data;
                ColumnPath path = getPath(metaData);
                ColumnChunkMetaData column = ColumnChunkMetaData.get(
                    path,
                    messageType.getType(path.toArray()).asPrimitiveType(),
                    fromFormatCodec(metaData.codec),
                    convertEncodingStats(metaData.getEncoding_stats()),
                    fromFormatEncodings(metaData.encodings),
                    fromParquetStatistics(
                        parquetMetadata.getCreated_by(),
                        metaData.statistics,
                        messageType.getType(path.toArray()).asPrimitiveType()),
                    metaData.data_page_offset,
                    metaData.dictionary_page_offset,
                    metaData.num_values,
                    metaData.total_compressed_size,
                    metaData.total_uncompressed_size);
                // TODO
                // index_page_offset
                // key_value_metadata
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }
    Map<String, String> keyValueMetaData = new HashMap<String, String>();
    List<KeyValue> key_value_metadata = parquetMetadata.getKey_value_metadata();
    if (key_value_metadata != null) {
        for (KeyValue keyValue : key_value_metadata) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    return new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, parquetMetadata.getCreated_by()), blocks);
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ParquetDecodingException (org.apache.parquet.io.ParquetDecodingException), KeyValue (org.apache.parquet.format.KeyValue), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), HashMap (java.util.HashMap), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), RowGroup (org.apache.parquet.format.RowGroup), ArrayList (java.util.ArrayList), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), ColumnChunk (org.apache.parquet.format.ColumnChunk), ColumnMetaData (org.apache.parquet.format.ColumnMetaData), MessageType (org.apache.parquet.schema.MessageType)
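
This overload is the variant of Example 1 without modular-encryption support: no InternalFileDecryptor is threaded through, so every column takes the plaintext path. A minimal sketch of the call (thriftFooter is a hypothetical variable holding a deserialized org.apache.parquet.format.FileMetaData):

ParquetMetadata converted = new ParquetMetadataConverter().fromParquetMetadata(thriftFooter);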

Example 5 with ColumnChunk

Use of org.apache.parquet.format.ColumnChunk in the project parquet-mr by apache.

From the class ParquetMetadataConverter, method addRowGroup.

private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
    // rowGroup.total_byte_size = ;
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
    for (ColumnChunkMetaData columnMetaData : columns) {
        // verify this is the right offset
        ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset());
        // they are in the same file for now
        columnChunk.file_path = block.getPath();
        columnChunk.meta_data = new ColumnMetaData(
            getType(columnMetaData.getType()),
            toFormatEncodings(columnMetaData.getEncodings()),
            Arrays.asList(columnMetaData.getPath().toArray()),
            toFormatCodec(columnMetaData.getCodec()),
            columnMetaData.getValueCount(),
            columnMetaData.getTotalUncompressedSize(),
            columnMetaData.getTotalSize(),
            columnMetaData.getFirstDataPageOffset());
        columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset();
        if (!columnMetaData.getStatistics().isEmpty()) {
            columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics()));
        }
        if (columnMetaData.getEncodingStats() != null) {
            columnChunk.meta_data.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats()));
        }
        // columnChunk.meta_data.index_page_offset = ;
        // columnChunk.meta_data.key_value_metadata = ; // nothing yet
        parquetColumns.add(columnChunk);
    }
    RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
    rowGroups.add(rowGroup);
}
Also used: ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), RowGroup (org.apache.parquet.format.RowGroup), ArrayList (java.util.ArrayList), ColumnMetaData (org.apache.parquet.format.ColumnMetaData), ColumnChunk (org.apache.parquet.format.ColumnChunk)
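
addRowGroup is private; callers reach it through the public toParquetMetadata, which walks the blocks of a hadoop-side ParquetMetadata and emits one thrift RowGroup per block. A minimal round-trip sketch (class name hypothetical):

import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class RowGroupConversionSketch {

    // Converts a hadoop-side footer back to its thrift form;
    // each BlockMetaData becomes one RowGroup via addRowGroup
    public static FileMetaData toThrift(ParquetMetadata footer) {
        return new ParquetMetadataConverter()
            .toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
    }
}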

Aggregations

ArrayList (java.util.ArrayList): 8
ColumnChunk (org.apache.parquet.format.ColumnChunk): 8
RowGroup (org.apache.parquet.format.RowGroup): 8
ColumnMetaData (org.apache.parquet.format.ColumnMetaData): 6
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 5
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 4
HashMap (java.util.HashMap): 3
KeyValue (org.apache.parquet.format.KeyValue): 3
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 3
IOException (java.io.IOException): 2
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2
ParquetCryptoRuntimeException (org.apache.parquet.crypto.ParquetCryptoRuntimeException): 2
FileMetaData (org.apache.parquet.format.FileMetaData): 2
SchemaElement (org.apache.parquet.format.SchemaElement): 2
Util.writeColumnMetaData (org.apache.parquet.format.Util.writeColumnMetaData): 2
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 2
IndexReference (org.apache.parquet.internal.hadoop.metadata.IndexReference): 2
ParquetDecodingException (org.apache.parquet.io.ParquetDecodingException): 2
MessageType (org.apache.parquet.schema.MessageType): 2
ParquetCorruptionException (com.facebook.presto.parquet.ParquetCorruptionException): 1