Use of org.apache.parquet.io.InvalidFileOffsetException in project parquet-mr by Apache.
The class ParquetMetadataConverter, method filterFileMetaDataByStart.
/**
 * Keeps only the row groups whose start offset is accepted by {@code filter} and
 * stores the surviving list back into {@code metaData}.
 *
 * <p>How the start offset of a row group is determined depends on whether the first
 * column chunk of the <em>first</em> row group carries its metadata:
 * <ul>
 *   <li>if it does, the offset is obtained from {@code getOffset} on each row group's
 *       first column chunk;</li>
 *   <li>otherwise {@code RowGroup.file_offset} is used and validated with
 *       {@code invalidFileOffset} against the previous row group's start offset and
 *       compressed size (see PARQUET-2078).</li>
 * </ul>
 *
 * <p>A {@code null} or empty row-group list results in an empty row-group list being
 * set on {@code metaData}.
 *
 * <p>Visible for testing.
 *
 * @param metaData file metadata to filter; modified in place
 * @param filter   offset filter deciding which row groups are kept
 * @return the same {@code metaData} instance, now holding only the accepted row groups
 * @throws InvalidFileOffsetException if a row group other than the first one has a
 *         {@code file_offset} that {@code invalidFileOffset} rejects
 */
static FileMetaData filterFileMetaDataByStart(FileMetaData metaData, OffsetMetadataFilter filter) {
  List<RowGroup> rowGroups = metaData.getRow_groups();
  List<RowGroup> newRowGroups = new ArrayList<RowGroup>();

  // Nothing to filter: avoid iterating a null list (previously a NullPointerException).
  if (rowGroups == null || rowGroups.isEmpty()) {
    metaData.setRow_groups(newRowGroups);
    return metaData;
  }

  // Start offset and compressed size of the previously visited row group; only
  // maintained on the file_offset path below.
  long preStartIndex = 0;
  long preCompressedSize = 0;

  // The first row group decides which offset source is used for all row groups.
  boolean firstColumnWithMetadata = rowGroups.get(0).getColumns().get(0).isSetMeta_data();

  for (RowGroup rowGroup : rowGroups) {
    long startIndex;
    ColumnChunk columnChunk = rowGroup.getColumns().get(0);
    if (firstColumnWithMetadata) {
      startIndex = getOffset(columnChunk);
    } else {
      assert rowGroup.isSetFile_offset();
      assert rowGroup.isSetTotal_compressed_size();

      // the file_offset of first block always holds the truth, while other blocks don't :
      // see PARQUET-2078 for details
      startIndex = rowGroup.getFile_offset();
      if (invalidFileOffset(startIndex, preStartIndex, preCompressedSize)) {
        if (preStartIndex == 0) {
          // first row group's offset is always 4
          startIndex = 4;
        } else {
          // A later row group cannot be repaired here; the caller has to split differently.
          throw new InvalidFileOffsetException(
              "corrupted RowGroup.file_offset found, "
                  + "please use file range instead of block offset for split.");
        }
      }
      preStartIndex = startIndex;
      preCompressedSize = rowGroup.getTotal_compressed_size();
    }

    if (filter.contains(startIndex)) {
      newRowGroups.add(rowGroup);
    }
  }

  metaData.setRow_groups(newRowGroups);
  return metaData;
}
Aggregations