Use of org.apache.parquet.format.ColumnChunk in project parquet-mr by apache.
The class ParquetMetadataConverter, method fromParquetMetadata.
public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata,
    InternalFileDecryptor fileDecryptor, boolean encryptedFooter) throws IOException {
  MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  List<RowGroup> row_groups = parquetMetadata.getRow_groups();
  if (row_groups != null) {
    for (RowGroup rowGroup : row_groups) {
      BlockMetaData blockMetaData = new BlockMetaData();
      blockMetaData.setRowCount(rowGroup.getNum_rows());
      blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
      // not set in legacy files
      if (rowGroup.isSetOrdinal()) {
        blockMetaData.setOrdinal(rowGroup.getOrdinal());
      }
      List<ColumnChunk> columns = rowGroup.getColumns();
      String filePath = columns.get(0).getFile_path();
      int columnOrdinal = -1;
      for (ColumnChunk columnChunk : columns) {
        columnOrdinal++;
        if ((filePath == null && columnChunk.getFile_path() != null)
            || (filePath != null && !filePath.equals(columnChunk.getFile_path()))) {
          throw new ParquetDecodingException("all column chunks of the same row group must be in the same file for now");
        }
        ColumnMetaData metaData = columnChunk.meta_data;
        ColumnCryptoMetaData cryptoMetaData = columnChunk.getCrypto_metadata();
        ColumnChunkMetaData column = null;
        ColumnPath columnPath = null;
        boolean encryptedMetadata = false;
        if (null == cryptoMetaData) {
          // Plaintext column
          columnPath = getPath(metaData);
          if (null != fileDecryptor && !fileDecryptor.plaintextFile()) {
            // mark this column as plaintext in encrypted file decryptor
            fileDecryptor.setColumnCryptoMetadata(columnPath, false, false, (byte[]) null, columnOrdinal);
          }
        } else {
          // Encrypted column
          boolean encryptedWithFooterKey = cryptoMetaData.isSetENCRYPTION_WITH_FOOTER_KEY();
          if (encryptedWithFooterKey) {
            // Column encrypted with footer key
            if (!encryptedFooter) {
              throw new ParquetCryptoRuntimeException("Column encrypted with footer key in file with plaintext footer");
            }
            if (null == metaData) {
              throw new ParquetCryptoRuntimeException("ColumnMetaData not set in Encryption with Footer key");
            }
            if (null == fileDecryptor) {
              throw new ParquetCryptoRuntimeException("Column encrypted with footer key: No keys available");
            }
            columnPath = getPath(metaData);
            fileDecryptor.setColumnCryptoMetadata(columnPath, true, true, (byte[]) null, columnOrdinal);
          } else {
            // Column encrypted with column key
            // setColumnCryptoMetadata triggers KMS interaction, hence delayed until this column is projected
            encryptedMetadata = true;
          }
        }
        String createdBy = parquetMetadata.getCreated_by();
        if (!encryptedMetadata) {
          // unencrypted column, or encrypted with footer key
          column = buildColumnChunkMetaData(metaData, columnPath,
              messageType.getType(columnPath.toArray()).asPrimitiveType(), createdBy);
          column.setRowGroupOrdinal(rowGroup.getOrdinal());
          if (metaData.isSetBloom_filter_offset()) {
            column.setBloomFilterOffset(metaData.getBloom_filter_offset());
          }
        } else {
          // column encrypted with column key
          // Metadata will be decrypted later, if this column is accessed
          EncryptionWithColumnKey columnKeyStruct = cryptoMetaData.getENCRYPTION_WITH_COLUMN_KEY();
          List<String> pathList = columnKeyStruct.getPath_in_schema();
          byte[] columnKeyMetadata = columnKeyStruct.getKey_metadata();
          columnPath = ColumnPath.get(pathList.toArray(new String[pathList.size()]));
          byte[] encryptedMetadataBuffer = columnChunk.getEncrypted_column_metadata();
          column = ColumnChunkMetaData.getWithEncryptedMetadata(this, columnPath,
              messageType.getType(columnPath.toArray()).asPrimitiveType(), encryptedMetadataBuffer,
              columnKeyMetadata, fileDecryptor, rowGroup.getOrdinal(), columnOrdinal, createdBy);
        }
        column.setColumnIndexReference(toColumnIndexReference(columnChunk));
        column.setOffsetIndexReference(toOffsetIndexReference(columnChunk));
        // TODO
        // index_page_offset
        // key_value_metadata
        blockMetaData.addColumn(column);
      }
      blockMetaData.setPath(filePath);
      blocks.add(blockMetaData);
    }
  }
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  List<KeyValue> key_value_metadata = parquetMetadata.getKey_value_metadata();
  if (key_value_metadata != null) {
    for (KeyValue keyValue : key_value_metadata) {
      keyValueMetaData.put(keyValue.key, keyValue.value);
    }
  }
  return new ParquetMetadata(
      new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, parquetMetadata.getCreated_by(), fileDecryptor),
      blocks);
}
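A minimal usage sketch for the overload above. The Thrift footer (org.apache.parquet.format.FileMetaData) and the InternalFileDecryptor are assumed to have been obtained elsewhere while reading the file, and the helper name is hypothetical; this is not the project's reader path, only an illustration of how the converted metadata can be consumed.

// Hypothetical helper; `thriftFooter` and `fileDecryptor` are assumed inputs (fileDecryptor may be null for plaintext files).
static void printRowGroupSummary(org.apache.parquet.format.FileMetaData thriftFooter,
    InternalFileDecryptor fileDecryptor) throws IOException {
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  // encryptedFooter = false: plaintext-footer mode; pass true when the footer itself was encrypted
  ParquetMetadata footer = converter.fromParquetMetadata(thriftFooter, fileDecryptor, false);
  for (BlockMetaData block : footer.getBlocks()) {
    System.out.println(block.getRowCount() + " rows in " + block.getColumns().size() + " column chunks");
  }
}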
Use of org.apache.parquet.format.ColumnChunk in project parquet-mr by apache.
The class ParquetMetadataConverter, method filterFileMetaDataByStart.
// Visible for testing
static FileMetaData filterFileMetaDataByStart(FileMetaData metaData, OffsetMetadataFilter filter) {
  List<RowGroup> rowGroups = metaData.getRow_groups();
  List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
  long preStartIndex = 0;
  long preCompressedSize = 0;
  boolean firstColumnWithMetadata = true;
  if (rowGroups != null && rowGroups.size() > 0) {
    firstColumnWithMetadata = rowGroups.get(0).getColumns().get(0).isSetMeta_data();
  }
  for (RowGroup rowGroup : rowGroups) {
    long startIndex;
    ColumnChunk columnChunk = rowGroup.getColumns().get(0);
    if (firstColumnWithMetadata) {
      startIndex = getOffset(columnChunk);
    } else {
      assert rowGroup.isSetFile_offset();
      assert rowGroup.isSetTotal_compressed_size();
      // the file_offset of the first block always holds the truth, while other blocks don't:
      // see PARQUET-2078 for details
      startIndex = rowGroup.getFile_offset();
      if (invalidFileOffset(startIndex, preStartIndex, preCompressedSize)) {
        // the first row group's offset is always 4
        if (preStartIndex == 0) {
          startIndex = 4;
        } else {
          throw new InvalidFileOffsetException(
              "corrupted RowGroup.file_offset found, please use file range instead of block offset for split.");
        }
      }
      preStartIndex = startIndex;
      preCompressedSize = rowGroup.getTotal_compressed_size();
    }
    if (filter.contains(startIndex)) {
      newRowGroups.add(rowGroup);
    }
  }
  metaData.setRow_groups(newRowGroups);
  return metaData;
}
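This start-offset filtering is normally reached through the public footer-reading path rather than called directly. A sketch, assuming the offsets(...) MetadataFilter factory on ParquetMetadataConverter and the readFooter(Configuration, Path, MetadataFilter) overload of ParquetFileReader; the helper name and the offsets passed in are illustrative.

// Hypothetical helper: keep only the row groups whose first data page starts at one of the given offsets.
static ParquetMetadata readRowGroupsStartingAt(Configuration conf, Path file, long... offsets) throws IOException {
  return ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.offsets(offsets));
}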
Use of org.apache.parquet.format.ColumnChunk in project parquet-mr by apache.
The class TestParquetMetadataConverter, method metadata.
private FileMetaData metadata(long... sizes) {
  List<SchemaElement> schema = emptyList();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long offset = 0;
  for (long size : sizes) {
    ColumnChunk columnChunk = new ColumnChunk(offset);
    columnChunk.setMeta_data(new ColumnMetaData(
        INT32,
        Collections.<org.apache.parquet.format.Encoding>emptyList(),
        Collections.<String>emptyList(),
        UNCOMPRESSED,
        10L,       // num_values
        size * 2,  // total_uncompressed_size
        size,      // total_compressed_size
        offset));  // data_page_offset
    rowGroups.add(new RowGroup(Arrays.asList(columnChunk), size, 1));
    offset += size;
  }
  return new FileMetaData(1, schema, sizes.length, rowGroups);
}
Use of org.apache.parquet.format.ColumnChunk in project parquet-mr by apache.
The class ParquetMetadataConverter, method fromParquetMetadata.
public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException {
  MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  List<RowGroup> row_groups = parquetMetadata.getRow_groups();
  if (row_groups != null) {
    for (RowGroup rowGroup : row_groups) {
      BlockMetaData blockMetaData = new BlockMetaData();
      blockMetaData.setRowCount(rowGroup.getNum_rows());
      blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
      List<ColumnChunk> columns = rowGroup.getColumns();
      String filePath = columns.get(0).getFile_path();
      for (ColumnChunk columnChunk : columns) {
        if ((filePath == null && columnChunk.getFile_path() != null)
            || (filePath != null && !filePath.equals(columnChunk.getFile_path()))) {
          throw new ParquetDecodingException("all column chunks of the same row group must be in the same file for now");
        }
        ColumnMetaData metaData = columnChunk.meta_data;
        ColumnPath path = getPath(metaData);
        ColumnChunkMetaData column = ColumnChunkMetaData.get(
            path,
            messageType.getType(path.toArray()).asPrimitiveType(),
            fromFormatCodec(metaData.codec),
            convertEncodingStats(metaData.getEncoding_stats()),
            fromFormatEncodings(metaData.encodings),
            fromParquetStatistics(parquetMetadata.getCreated_by(), metaData.statistics,
                messageType.getType(path.toArray()).asPrimitiveType()),
            metaData.data_page_offset,
            metaData.dictionary_page_offset,
            metaData.num_values,
            metaData.total_compressed_size,
            metaData.total_uncompressed_size);
        // TODO
        // index_page_offset
        // key_value_metadata
        blockMetaData.addColumn(column);
      }
      blockMetaData.setPath(filePath);
      blocks.add(blockMetaData);
    }
  }
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  List<KeyValue> key_value_metadata = parquetMetadata.getKey_value_metadata();
  if (key_value_metadata != null) {
    for (KeyValue keyValue : key_value_metadata) {
      keyValueMetaData.put(keyValue.key, keyValue.value);
    }
  }
  return new ParquetMetadata(
      new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, parquetMetadata.getCreated_by()),
      blocks);
}
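A short sketch of consuming the result of this overload; the Thrift footer is assumed to have been read elsewhere, and the variable names are illustrative.

// File-level metadata carried over by the conversion above.
ParquetMetadata footer = new ParquetMetadataConverter().fromParquetMetadata(thriftFooter);
MessageType schema = footer.getFileMetaData().getSchema();
String createdBy = footer.getFileMetaData().getCreatedBy();
Map<String, String> keyValues = footer.getFileMetaData().getKeyValueMetaData();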
Use of org.apache.parquet.format.ColumnChunk in project parquet-mr by apache.
The class ParquetMetadataConverter, method addRowGroup.
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
  // rowGroup.total_byte_size = ;
  List<ColumnChunkMetaData> columns = block.getColumns();
  List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
  for (ColumnChunkMetaData columnMetaData : columns) {
    // verify this is the right offset
    ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset());
    // they are in the same file for now
    columnChunk.file_path = block.getPath();
    columnChunk.meta_data = new ColumnMetaData(
        getType(columnMetaData.getType()),
        toFormatEncodings(columnMetaData.getEncodings()),
        Arrays.asList(columnMetaData.getPath().toArray()),
        toFormatCodec(columnMetaData.getCodec()),
        columnMetaData.getValueCount(),
        columnMetaData.getTotalUncompressedSize(),
        columnMetaData.getTotalSize(),
        columnMetaData.getFirstDataPageOffset());
    columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset();
    if (!columnMetaData.getStatistics().isEmpty()) {
      columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics()));
    }
    if (columnMetaData.getEncodingStats() != null) {
      columnChunk.meta_data.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats()));
    }
    // columnChunk.meta_data.index_page_offset = ;
    // columnChunk.meta_data.key_value_metadata = ; // nothing yet
    parquetColumns.add(columnChunk);
  }
  RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
  rowGroups.add(rowGroup);
}
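For context, a simplified outline (reconstructed here, not quoted from the project) of the toParquetMetadata caller that drives addRowGroup: it walks every block, accumulates the row count, and emits one RowGroup per block; key/value metadata and created_by handling are omitted from this sketch.

public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);  // one RowGroup per BlockMetaData
  }
  return new FileMetaData(currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()), numRows, rowGroups);
}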