Use of org.apache.parquet.format.ColumnChunk in project presto by prestodb.
The class MetadataReader, method readFooter.
public static ParquetFileMetadata readFooter(ParquetDataSource parquetDataSource, long fileSize) throws IOException {
    // Parquet File Layout:
    //
    // MAGIC
    // variable: Data
    // variable: Metadata
    // 4 bytes: MetadataLength
    // MAGIC
    validateParquet(fileSize >= MAGIC.length() + POST_SCRIPT_SIZE, "%s is not a valid Parquet File", parquetDataSource.getId());

    // EXPECTED_FOOTER_SIZE is an int, so this will never fail
    byte[] buffer = new byte[toIntExact(min(fileSize, EXPECTED_FOOTER_SIZE))];
    parquetDataSource.readFully(fileSize - buffer.length, buffer);
    Slice tailSlice = wrappedBuffer(buffer);

    Slice magic = tailSlice.slice(tailSlice.length() - MAGIC.length(), MAGIC.length());
    if (!MAGIC.equals(magic)) {
        throw new ParquetCorruptionException(format(
                "Not valid Parquet file: %s expected magic number: %s got: %s",
                parquetDataSource.getId(),
                Arrays.toString(MAGIC.getBytes()),
                Arrays.toString(magic.getBytes())));
    }

    int metadataLength = tailSlice.getInt(tailSlice.length() - POST_SCRIPT_SIZE);
    int completeFooterSize = metadataLength + POST_SCRIPT_SIZE;
    long metadataFileOffset = fileSize - completeFooterSize;
    validateParquet(
            metadataFileOffset >= MAGIC.length() && metadataFileOffset + POST_SCRIPT_SIZE < fileSize,
            "Corrupted Parquet file: %s metadata index: %s out of range",
            parquetDataSource.getId(),
            metadataFileOffset);

    // Ensure the slice covers the entire metadata range
    if (tailSlice.length() < completeFooterSize) {
        byte[] footerBuffer = new byte[completeFooterSize];
        parquetDataSource.readFully(metadataFileOffset, footerBuffer, 0, footerBuffer.length - tailSlice.length());
        // Copy the previous slice contents into the new buffer
        tailSlice.getBytes(0, footerBuffer, footerBuffer.length - tailSlice.length(), tailSlice.length());
        tailSlice = wrappedBuffer(footerBuffer, 0, footerBuffer.length);
    }

    FileMetaData fileMetaData = readFileMetaData(tailSlice.slice(tailSlice.length() - completeFooterSize, metadataLength).getInput());
    List<SchemaElement> schema = fileMetaData.getSchema();
    validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", parquetDataSource.getId());

    MessageType messageType = readParquetSchema(schema);
    List<BlockMetaData> blocks = new ArrayList<>();
    List<RowGroup> rowGroups = fileMetaData.getRow_groups();
    if (rowGroups != null) {
        for (RowGroup rowGroup : rowGroups) {
            BlockMetaData blockMetaData = new BlockMetaData();
            blockMetaData.setRowCount(rowGroup.getNum_rows());
            blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
            List<ColumnChunk> columns = rowGroup.getColumns();
            validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
            String filePath = columns.get(0).getFile_path();
            for (ColumnChunk columnChunk : columns) {
                validateParquet(
                        (filePath == null && columnChunk.getFile_path() == null) || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                        "all column chunks of the same row group must be in the same file");
                ColumnMetaData metaData = columnChunk.meta_data;
                String[] path = metaData.path_in_schema.stream()
                        .map(value -> value.toLowerCase(Locale.ENGLISH))
                        .toArray(String[]::new);
                ColumnPath columnPath = ColumnPath.get(path);
                PrimitiveType primitiveType = messageType.getType(columnPath.toArray()).asPrimitiveType();
                PrimitiveTypeName primitiveTypeName = primitiveType.getPrimitiveTypeName();
                ColumnChunkMetaData column = ColumnChunkMetaData.get(
                        columnPath,
                        primitiveType,
                        CompressionCodecName.fromParquet(metaData.codec),
                        PARQUET_METADATA_CONVERTER.convertEncodingStats(metaData.encoding_stats),
                        readEncodings(metaData.encodings),
                        readStats(metaData.statistics, primitiveTypeName),
                        metaData.data_page_offset,
                        metaData.dictionary_page_offset,
                        metaData.num_values,
                        metaData.total_compressed_size,
                        metaData.total_uncompressed_size);
                column.setColumnIndexReference(toColumnIndexReference(columnChunk));
                column.setOffsetIndexReference(toOffsetIndexReference(columnChunk));
                blockMetaData.addColumn(column);
            }
            blockMetaData.setPath(filePath);
            blocks.add(blockMetaData);
        }
    }

    Map<String, String> keyValueMetaData = new HashMap<>();
    List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
    if (keyValueList != null) {
        for (KeyValue keyValue : keyValueList) {
            keyValueMetaData.put(keyValue.key, keyValue.value);
        }
    }
    ParquetMetadata parquetMetadata = new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    return new ParquetFileMetadata(parquetMetadata, toIntExact(metadataLength));
}
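The offset arithmetic above follows the postscript layout sketched in the method's comment: the last bytes of a Parquet file are a 4-byte metadata length followed by the 4-byte magic "PAR1". The following is a minimal, self-contained sketch of that arithmetic, assuming POST_SCRIPT_SIZE is MAGIC.length() + 4 as in the reader above; the concrete file size and metadata length are hypothetical placeholders.

// FooterMath.java - illustrates the postscript arithmetic used by readFooter (hypothetical values only).
public class FooterMath {
    public static void main(String[] args) {
        final int magicLength = 4;                       // "PAR1"
        final int postScriptSize = magicLength + 4;      // 4-byte metadata length + trailing magic

        long fileSize = 1_000_000;                       // hypothetical Parquet file size
        int metadataLength = 12_345;                     // hypothetical value of the 4-byte length word

        int completeFooterSize = metadataLength + postScriptSize;   // Thrift metadata + postscript
        long metadataFileOffset = fileSize - completeFooterSize;    // where the FileMetaData bytes begin

        // Same bounds check as the validateParquet(...) call above: the metadata must sit
        // strictly between the leading magic and the trailing postscript.
        boolean inRange = metadataFileOffset >= magicLength
                && metadataFileOffset + postScriptSize < fileSize;
        System.out.println("metadata starts at " + metadataFileOffset + ", valid = " + inRange);
    }
}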
Use of org.apache.parquet.format.ColumnChunk in project parquet-mr by apache.
The class ParquetMetadataConverter, method addRowGroup.
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block, InternalFileEncryptor fileEncryptor) {
    // rowGroup.total_byte_size = ;
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
    int rowGroupOrdinal = rowGroups.size();
    int columnOrdinal = -1;
    ByteArrayOutputStream tempOutStream = null;
    for (ColumnChunkMetaData columnMetaData : columns) {
        // verify this is the right offset
        ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset());
        // they are in the same file for now
        columnChunk.file_path = block.getPath();
        InternalColumnEncryptionSetup columnSetup = null;
        boolean writeCryptoMetadata = false;
        boolean encryptMetaData = false;
        ColumnPath path = columnMetaData.getPath();
        if (null != fileEncryptor) {
            columnOrdinal++;
            columnSetup = fileEncryptor.getColumnSetup(path, false, columnOrdinal);
            writeCryptoMetadata = columnSetup.isEncrypted();
            encryptMetaData = fileEncryptor.encryptColumnMetaData(columnSetup);
        }
        ColumnMetaData metaData = new ColumnMetaData(
                getType(columnMetaData.getType()),
                toFormatEncodings(columnMetaData.getEncodings()),
                Arrays.asList(columnMetaData.getPath().toArray()),
                toFormatCodec(columnMetaData.getCodec()),
                columnMetaData.getValueCount(),
                columnMetaData.getTotalUncompressedSize(),
                columnMetaData.getTotalSize(),
                columnMetaData.getFirstDataPageOffset());
        if (columnMetaData.getEncodingStats() != null && columnMetaData.getEncodingStats().hasDictionaryPages()) {
            metaData.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset());
        }
        long bloomFilterOffset = columnMetaData.getBloomFilterOffset();
        if (bloomFilterOffset >= 0) {
            metaData.setBloom_filter_offset(bloomFilterOffset);
        }
        if (columnMetaData.getStatistics() != null && !columnMetaData.getStatistics().isEmpty()) {
            metaData.setStatistics(toParquetStatistics(columnMetaData.getStatistics(), this.statisticsTruncateLength));
        }
        if (columnMetaData.getEncodingStats() != null) {
            metaData.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats()));
        }
        if (!encryptMetaData) {
            columnChunk.setMeta_data(metaData);
        } else {
            // Serialize and encrypt ColumnMetadata separately
            byte[] columnMetaDataAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.ColumnMetaData, rowGroupOrdinal, columnSetup.getOrdinal(), -1);
            if (null == tempOutStream) {
                tempOutStream = new ByteArrayOutputStream();
            } else {
                tempOutStream.reset();
            }
            try {
                writeColumnMetaData(metaData, tempOutStream, columnSetup.getMetaDataEncryptor(), columnMetaDataAAD);
            } catch (IOException e) {
                throw new ParquetCryptoRuntimeException("Failed to serialize and encrypt ColumnMetadata for " + columnMetaData.getPath(), e);
            }
            columnChunk.setEncrypted_column_metadata(tempOutStream.toByteArray());
            // Keep redacted metadata version for old readers
            if (!fileEncryptor.isFooterEncrypted()) {
                ColumnMetaData metaDataRedacted = metaData.deepCopy();
                if (metaDataRedacted.isSetStatistics()) {
                    metaDataRedacted.unsetStatistics();
                }
                if (metaDataRedacted.isSetEncoding_stats()) {
                    metaDataRedacted.unsetEncoding_stats();
                }
                columnChunk.setMeta_data(metaDataRedacted);
            }
        }
        if (writeCryptoMetadata) {
            columnChunk.setCrypto_metadata(columnSetup.getColumnCryptoMetaData());
        }
        // columnChunk.meta_data.index_page_offset = ;
        // columnChunk.meta_data.key_value_metadata = ; // nothing yet
        IndexReference columnIndexRef = columnMetaData.getColumnIndexReference();
        if (columnIndexRef != null) {
            columnChunk.setColumn_index_offset(columnIndexRef.getOffset());
            columnChunk.setColumn_index_length(columnIndexRef.getLength());
        }
        IndexReference offsetIndexRef = columnMetaData.getOffsetIndexReference();
        if (offsetIndexRef != null) {
            columnChunk.setOffset_index_offset(offsetIndexRef.getOffset());
            columnChunk.setOffset_index_length(offsetIndexRef.getLength());
        }
        parquetColumns.add(columnChunk);
    }
    RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
    rowGroup.setFile_offset(block.getStartingPos());
    rowGroup.setTotal_compressed_size(block.getCompressedSize());
    rowGroup.setOrdinal((short) rowGroupOrdinal);
    rowGroups.add(rowGroup);
}
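For reference, here is a minimal sketch of the Thrift footer objects this method assembles in the unencrypted case, using only the org.apache.parquet.format constructors and setters that appear above plus the Thrift-generated Type/Encoding/CompressionCodec enums. Every offset, size, and count is a hypothetical placeholder, not a value the library produces.

// RowGroupSketch.java - a minimal, unencrypted assembly of the Thrift objects built by addRowGroup above.
import java.util.Collections;
import org.apache.parquet.format.ColumnChunk;
import org.apache.parquet.format.ColumnMetaData;
import org.apache.parquet.format.CompressionCodec;
import org.apache.parquet.format.Encoding;
import org.apache.parquet.format.RowGroup;
import org.apache.parquet.format.Type;

public class RowGroupSketch {
    public static void main(String[] args) {
        long firstDataPageOffset = 4;   // hypothetical: data starts right after the leading magic

        // Required ColumnMetaData fields, mirroring the 8-argument constructor used above.
        ColumnMetaData metaData = new ColumnMetaData(
                Type.INT32,                                 // physical type
                Collections.singletonList(Encoding.PLAIN),  // encodings
                Collections.singletonList("id"),            // path_in_schema
                CompressionCodec.SNAPPY,
                1_000L,                                     // num_values
                8_192L,                                     // total_uncompressed_size
                4_096L,                                     // total_compressed_size
                firstDataPageOffset);                       // data_page_offset

        // The ColumnChunk constructor takes the chunk's file offset; meta_data is attached
        // directly in the plaintext (non-encrypted) branch above.
        ColumnChunk columnChunk = new ColumnChunk(firstDataPageOffset);
        columnChunk.setMeta_data(metaData);

        // RowGroup(columns, total_byte_size, num_rows) plus the optional fields set above.
        RowGroup rowGroup = new RowGroup(Collections.singletonList(columnChunk), 8_192L, 1_000L);
        rowGroup.setFile_offset(firstDataPageOffset);
        rowGroup.setTotal_compressed_size(4_096L);
        rowGroup.setOrdinal((short) 0);

        System.out.println(rowGroup);
    }
}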
Use of org.apache.parquet.format.ColumnChunk in project parquet-mr by apache.
The class ParquetMetadataConverter, method filterFileMetaDataByMidpoint.
// Visible for testing
static FileMetaData filterFileMetaDataByMidpoint(FileMetaData metaData, RangeMetadataFilter filter) {
    List<RowGroup> rowGroups = metaData.getRow_groups();
    List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
    long preStartIndex = 0;
    long preCompressedSize = 0;
    boolean firstColumnWithMetadata = true;
    if (rowGroups != null && rowGroups.size() > 0) {
        firstColumnWithMetadata = rowGroups.get(0).getColumns().get(0).isSetMeta_data();
    }
    for (RowGroup rowGroup : rowGroups) {
        long totalSize = 0;
        long startIndex;
        ColumnChunk columnChunk = rowGroup.getColumns().get(0);
        if (firstColumnWithMetadata) {
            startIndex = getOffset(columnChunk);
        } else {
            assert rowGroup.isSetFile_offset();
            assert rowGroup.isSetTotal_compressed_size();
            // the file_offset of the first block always holds the truth, while other blocks don't:
            // see PARQUET-2078 for details
            startIndex = rowGroup.getFile_offset();
            if (invalidFileOffset(startIndex, preStartIndex, preCompressedSize)) {
                // the first row group's offset is always 4
                if (preStartIndex == 0) {
                    startIndex = 4;
                } else {
                    // use minStartIndex (imprecise in case of padding, but good enough for filtering)
                    startIndex = preStartIndex + preCompressedSize;
                }
            }
            preStartIndex = startIndex;
            preCompressedSize = rowGroup.getTotal_compressed_size();
        }
        if (rowGroup.isSetTotal_compressed_size()) {
            totalSize = rowGroup.getTotal_compressed_size();
        } else {
            for (ColumnChunk col : rowGroup.getColumns()) {
                totalSize += col.getMeta_data().getTotal_compressed_size();
            }
        }
        long midPoint = startIndex + totalSize / 2;
        if (filter.contains(midPoint)) {
            newRowGroups.add(rowGroup);
        }
    }
    metaData.setRow_groups(newRowGroups);
    return metaData;
}
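The decision above reduces to a single midpoint test: a row group is kept when the midpoint of its byte range falls inside the filter's range. Below is a small, self-contained illustration of that rule, assuming a half-open [splitStart, splitEnd) range; the split boundaries and row-group extents are hypothetical, and RangeMetadataFilter itself is not used.

// MidpointRuleSketch.java - illustrates the midpoint rule applied by filterFileMetaDataByMidpoint above,
// without the parquet-format types. All numbers are hypothetical.
public class MidpointRuleSketch {
    public static void main(String[] args) {
        long splitStart = 0;            // inclusive
        long splitEnd = 64 * 1024;      // exclusive

        // (startIndex, totalCompressedSize) pairs for three hypothetical row groups
        long[][] rowGroups = {
                {4, 60_000},            // midpoint 30_004  -> inside the split, kept
                {60_004, 60_000},       // midpoint 90_004  -> outside, dropped
                {120_004, 10_000}       // midpoint 125_004 -> outside, dropped
        };

        for (long[] rg : rowGroups) {
            long midPoint = rg[0] + rg[1] / 2;
            boolean keep = midPoint >= splitStart && midPoint < splitEnd;
            System.out.println("row group at " + rg[0] + " midpoint=" + midPoint + " kept=" + keep);
        }
    }
}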