Example 6 with IndexReference

Use of org.apache.parquet.internal.hadoop.metadata.IndexReference in the Apache Drill project.

From the class ParquetFileWriter, method serializeOffsetIndexes. It serializes each column chunk's offset index to the output stream, encrypting it when the column is encrypted, and records the written byte range on the column metadata as an IndexReference.

private static void serializeOffsetIndexes(List<List<OffsetIndex>> offsetIndexes, List<BlockMetaData> blocks, PositionOutputStream out, InternalFileEncryptor fileEncryptor) throws IOException {
    LOG.debug("{}: offset indexes", out.getPos());
    for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
        BlockMetaData block = blocks.get(bIndex);
        List<ColumnChunkMetaData> columns = block.getColumns();
        List<OffsetIndex> blockOffsetIndexes = offsetIndexes.get(bIndex);
        for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
            OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
            if (offsetIndex == null) {
                continue;
            }
            ColumnChunkMetaData column = columns.get(cIndex);
            BlockCipher.Encryptor offsetIndexEncryptor = null;
            byte[] offsetIndexAAD = null;
            // Set up an encryptor and module AAD if this column's metadata is encrypted
            if (null != fileEncryptor) {
                InternalColumnEncryptionSetup columnEncryptionSetup = fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
                if (columnEncryptionSetup.isEncrypted()) {
                    offsetIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
                    offsetIndexAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.OffsetIndex, block.getOrdinal(), columnEncryptionSetup.getOrdinal(), -1);
                }
            }
            // Record the byte range of the serialized index so the footer can point to it
            long offset = out.getPos();
            Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), out, offsetIndexEncryptor, offsetIndexAAD);
            column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
        }
    }
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), BlockCipher (org.apache.parquet.format.BlockCipher), InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup), IndexReference (org.apache.parquet.internal.hadoop.metadata.IndexReference), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex)
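
When the file is not encrypted, fileEncryptor is null, so the encryptor and AAD stay null and Util.writeOffsetIndex emits the plain Thrift bytes. A simplified, plaintext-only sketch of the inner loop above, using the same names and calls as the method:

for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
    OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
    if (offsetIndex == null) {
        // no offset index was collected for this chunk
        continue;
    }
    long offset = out.getPos();
    // null encryptor and AAD: the index is written unencrypted
    Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), out, null, null);
    columns.get(cIndex).setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
}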

Example 7 with IndexReference

Use of org.apache.parquet.internal.hadoop.metadata.IndexReference in the Apache Drill project.

From the class ParquetFileWriter, method serializeColumnIndexes. It mirrors serializeOffsetIndexes: each column chunk's column index is converted to its Thrift form, written (and encrypted when required), and its byte range recorded as an IndexReference.

private static void serializeColumnIndexes(List<List<ColumnIndex>> columnIndexes, List<BlockMetaData> blocks, PositionOutputStream out, InternalFileEncryptor fileEncryptor) throws IOException {
    LOG.debug("{}: column indexes", out.getPos());
    for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
        BlockMetaData block = blocks.get(bIndex);
        List<ColumnChunkMetaData> columns = block.getColumns();
        List<ColumnIndex> blockColumnIndexes = columnIndexes.get(bIndex);
        for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
            ColumnChunkMetaData column = columns.get(cIndex);
            org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter.toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
            if (columnIndex == null) {
                continue;
            }
            BlockCipher.Encryptor columnIndexEncryptor = null;
            byte[] columnIndexAAD = null;
            // Encrypt the column index when column encryption is configured for this column
            if (null != fileEncryptor) {
                InternalColumnEncryptionSetup columnEncryptionSetup = fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
                if (columnEncryptionSetup.isEncrypted()) {
                    columnIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
                    columnIndexAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.ColumnIndex, block.getOrdinal(), columnEncryptionSetup.getOrdinal(), -1);
                }
            }
            // Record the byte range of the serialized column index in the column metadata
            long offset = out.getPos();
            Util.writeColumnIndex(columnIndex, out, columnIndexEncryptor, columnIndexAAD);
            column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
        }
    }
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), BlockCipher (org.apache.parquet.format.BlockCipher), ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex), InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup), IndexReference (org.apache.parquet.internal.hadoop.metadata.IndexReference)
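
The reader-side counterpart to this method is ParquetFileReader.readColumnIndex(ColumnChunkMetaData), which follows the IndexReference written here. A minimal sketch, assuming an already-open ParquetFileReader named reader:

for (BlockMetaData block : reader.getFooter().getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
        // returns null when no column index was written for this chunk
        ColumnIndex columnIndex = reader.readColumnIndex(column);
        if (columnIndex != null) {
            // getNullPages() flags the pages that contain only nulls
            System.out.println(column.getPath() + " null pages: " + columnIndex.getNullPages());
        }
    }
}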

Example 8 with IndexReference

Use of org.apache.parquet.internal.hadoop.metadata.IndexReference in the Apache parquet-mr project.

From the class ParquetFileReader, method readOffsetIndex. It follows the IndexReference stored in the column metadata: it seeks to the recorded offset, sets up a decryptor and module AAD if the file is encrypted, and deserializes the offset index.

/**
 * @param column
 *          the column chunk for which the offset index is to be returned
 * @return the offset index for the specified column chunk or {@code null} if there is no index
 * @throws IOException
 *           if any I/O error occurs while reading the file
 */
@Private
public OffsetIndex readOffsetIndex(ColumnChunkMetaData column) throws IOException {
    IndexReference ref = column.getOffsetIndexReference();
    if (ref == null) {
        return null;
    }
    // Jump to the byte range recorded at write time
    f.seek(ref.getOffset());
    BlockCipher.Decryptor offsetIndexDecryptor = null;
    byte[] offsetIndexAAD = null;
    if (null != fileDecryptor && !fileDecryptor.plaintextFile()) {
        InternalColumnDecryptionSetup columnDecryptionSetup = fileDecryptor.getColumnSetup(column.getPath());
        if (columnDecryptionSetup.isEncrypted()) {
            offsetIndexDecryptor = columnDecryptionSetup.getMetaDataDecryptor();
            offsetIndexAAD = AesCipher.createModuleAAD(fileDecryptor.getFileAAD(), ModuleType.OffsetIndex, column.getRowGroupOrdinal(), columnDecryptionSetup.getOrdinal(), -1);
        }
    }
    return ParquetMetadataConverter.fromParquetOffsetIndex(Util.readOffsetIndex(f, offsetIndexDecryptor, offsetIndexAAD));
}
Also used: BlockCipher (org.apache.parquet.format.BlockCipher), InternalColumnDecryptionSetup (org.apache.parquet.crypto.InternalColumnDecryptionSetup), IndexReference (org.apache.parquet.internal.hadoop.metadata.IndexReference), Private (org.apache.yetus.audience.InterfaceAudience.Private)
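
A minimal usage sketch for readOffsetIndex (a hypothetical helper, not from the source; it assumes an unencrypted file and uses the standard parquet-hadoop entry points HadoopInputFile and ParquetFileReader.open):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;

public static void printPageCounts(String path) throws IOException {
    try (ParquetFileReader reader = ParquetFileReader.open(
            HadoopInputFile.fromPath(new Path(path), new Configuration()))) {
        for (BlockMetaData block : reader.getFooter().getBlocks()) {
            for (ColumnChunkMetaData column : block.getColumns()) {
                OffsetIndex offsetIndex = reader.readOffsetIndex(column);
                if (offsetIndex == null) {
                    // no offset index was written for this chunk
                    continue;
                }
                // each entry in the offset index locates one data page in the chunk
                System.out.println(column.getPath() + ": " + offsetIndex.getPageCount() + " pages");
            }
        }
    }
}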

Example 9 with IndexReference

Use of org.apache.parquet.internal.hadoop.metadata.IndexReference in the Apache parquet-mr project.

From the class ParquetMetadataConverter, method addRowGroup. It converts a BlockMetaData into the Thrift RowGroup structure for the footer; near the end it copies each column's column-index and offset-index IndexReference into the corresponding offset and length fields of the Thrift ColumnChunk.

private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block, InternalFileEncryptor fileEncryptor) {
    // rowGroup.total_byte_size = ;
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
    int rowGroupOrdinal = rowGroups.size();
    int columnOrdinal = -1;
    ByteArrayOutputStream tempOutStream = null;
    for (ColumnChunkMetaData columnMetaData : columns) {
        // verify this is the right offset
        ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset());
        // they are in the same file for now
        columnChunk.file_path = block.getPath();
        InternalColumnEncryptionSetup columnSetup = null;
        boolean writeCryptoMetadata = false;
        boolean encryptMetaData = false;
        ColumnPath path = columnMetaData.getPath();
        if (null != fileEncryptor) {
            columnOrdinal++;
            columnSetup = fileEncryptor.getColumnSetup(path, false, columnOrdinal);
            writeCryptoMetadata = columnSetup.isEncrypted();
            encryptMetaData = fileEncryptor.encryptColumnMetaData(columnSetup);
        }
        ColumnMetaData metaData = new ColumnMetaData(getType(columnMetaData.getType()), toFormatEncodings(columnMetaData.getEncodings()), Arrays.asList(columnMetaData.getPath().toArray()), toFormatCodec(columnMetaData.getCodec()), columnMetaData.getValueCount(), columnMetaData.getTotalUncompressedSize(), columnMetaData.getTotalSize(), columnMetaData.getFirstDataPageOffset());
        if (columnMetaData.getEncodingStats() != null && columnMetaData.getEncodingStats().hasDictionaryPages()) {
            metaData.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset());
        }
        long bloomFilterOffset = columnMetaData.getBloomFilterOffset();
        if (bloomFilterOffset >= 0) {
            metaData.setBloom_filter_offset(bloomFilterOffset);
        }
        if (columnMetaData.getStatistics() != null && !columnMetaData.getStatistics().isEmpty()) {
            metaData.setStatistics(toParquetStatistics(columnMetaData.getStatistics(), this.statisticsTruncateLength));
        }
        if (columnMetaData.getEncodingStats() != null) {
            metaData.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats()));
        }
        if (!encryptMetaData) {
            columnChunk.setMeta_data(metaData);
        } else {
            // Serialize and encrypt ColumnMetadata separately
            byte[] columnMetaDataAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.ColumnMetaData, rowGroupOrdinal, columnSetup.getOrdinal(), -1);
            if (null == tempOutStream) {
                tempOutStream = new ByteArrayOutputStream();
            } else {
                tempOutStream.reset();
            }
            try {
                writeColumnMetaData(metaData, tempOutStream, columnSetup.getMetaDataEncryptor(), columnMetaDataAAD);
            } catch (IOException e) {
                throw new ParquetCryptoRuntimeException("Failed to serialize and encrypt ColumnMetadata for " + columnMetaData.getPath(), e);
            }
            columnChunk.setEncrypted_column_metadata(tempOutStream.toByteArray());
            // Keep redacted metadata version for old readers
            if (!fileEncryptor.isFooterEncrypted()) {
                ColumnMetaData metaDataRedacted = metaData.deepCopy();
                if (metaDataRedacted.isSetStatistics())
                    metaDataRedacted.unsetStatistics();
                if (metaDataRedacted.isSetEncoding_stats())
                    metaDataRedacted.unsetEncoding_stats();
                columnChunk.setMeta_data(metaDataRedacted);
            }
        }
        if (writeCryptoMetadata) {
            columnChunk.setCrypto_metadata(columnSetup.getColumnCryptoMetaData());
        }
        // columnChunk.meta_data.index_page_offset = ;
        // columnChunk.meta_data.key_value_metadata = ; // nothing yet
        // Copy the index references recorded by ParquetFileWriter into the Thrift footer
        IndexReference columnIndexRef = columnMetaData.getColumnIndexReference();
        if (columnIndexRef != null) {
            columnChunk.setColumn_index_offset(columnIndexRef.getOffset());
            columnChunk.setColumn_index_length(columnIndexRef.getLength());
        }
        IndexReference offsetIndexRef = columnMetaData.getOffsetIndexReference();
        if (offsetIndexRef != null) {
            columnChunk.setOffset_index_offset(offsetIndexRef.getOffset());
            columnChunk.setOffset_index_length(offsetIndexRef.getLength());
        }
        parquetColumns.add(columnChunk);
    }
    RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
    rowGroup.setFile_offset(block.getStartingPos());
    rowGroup.setTotal_compressed_size(block.getCompressedSize());
    rowGroup.setOrdinal((short) rowGroupOrdinal);
    rowGroups.add(rowGroup);
}
Also used: ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), ParquetCryptoRuntimeException (org.apache.parquet.crypto.ParquetCryptoRuntimeException), RowGroup (org.apache.parquet.format.RowGroup), ArrayList (java.util.ArrayList), ByteArrayOutputStream (java.io.ByteArrayOutputStream), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), IOException (java.io.IOException), ColumnChunk (org.apache.parquet.format.ColumnChunk), InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup), Util.writeColumnMetaData (org.apache.parquet.format.Util.writeColumnMetaData), ColumnMetaData (org.apache.parquet.format.ColumnMetaData), IndexReference (org.apache.parquet.internal.hadoop.metadata.IndexReference)
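
On the read path, the converter rebuilds each IndexReference from these Thrift fields. A hedged sketch of that inverse mapping (a hypothetical helper; the Thrift accessor names follow the standard generated-Java naming and are assumed here):

// Hypothetical helper mirroring the write path above: rebuild the
// column-index reference from the Thrift ColumnChunk, or null if absent.
private static IndexReference toColumnIndexReference(ColumnChunk columnChunk) {
    if (columnChunk.isSetColumn_index_offset()) {
        return new IndexReference(columnChunk.getColumn_index_offset(), columnChunk.getColumn_index_length());
    }
    // no column index was written for this chunk
    return null;
}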

Aggregations

IndexReference (org.apache.parquet.internal.hadoop.metadata.IndexReference): 9 uses
BlockCipher (org.apache.parquet.format.BlockCipher): 6 uses
InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup): 5 uses
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 5 uses
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 4 uses
InternalColumnDecryptionSetup (org.apache.parquet.crypto.InternalColumnDecryptionSetup): 2 uses
ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex): 2 uses
OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex): 2 uses
Private (org.apache.yetus.audience.InterfaceAudience.Private): 2 uses
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1 use
IOException (java.io.IOException): 1 use
ArrayList (java.util.ArrayList): 1 use
ParquetCryptoRuntimeException (org.apache.parquet.crypto.ParquetCryptoRuntimeException): 1 use
ColumnChunk (org.apache.parquet.format.ColumnChunk): 1 use
ColumnMetaData (org.apache.parquet.format.ColumnMetaData): 1 use
RowGroup (org.apache.parquet.format.RowGroup): 1 use
Util.writeColumnMetaData (org.apache.parquet.format.Util.writeColumnMetaData): 1 use
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 1 use