Search in sources :

Example 1 with InvalidFileOffsetException

Use of org.apache.parquet.io.InvalidFileOffsetException in the project parquet-mr by Apache.

The method filterFileMetaDataByStart of the class ParquetMetadataConverter.

/**
 * Keeps only the row groups whose start offset is accepted by {@code filter}.
 *
 * <p>The start offset of a row group comes from its first column chunk (via
 * {@code getOffset}) when the first column chunk of the first row group has its
 * metadata set. Otherwise it comes from {@code RowGroup.file_offset}, which is
 * validated against the previous row group (see PARQUET-2078).
 *
 * <p>The given {@code metaData} is modified in place: its row-group list is replaced
 * by the filtered list. If {@code metaData} carries no row-group list at all
 * ({@code getRow_groups()} returns {@code null}), it is returned unchanged.
 *
 * @param metaData the file metadata whose row groups are filtered (mutated)
 * @param filter   the offset filter deciding which row groups are kept
 * @return the same {@code metaData} instance
 * @throws InvalidFileOffsetException if a row group other than the first one has a
 *         {@code file_offset} that {@code invalidFileOffset} rejects
 */
// Visible for testing
static FileMetaData filterFileMetaDataByStart(FileMetaData metaData, OffsetMetadataFilter filter) {
    List<RowGroup> rowGroups = metaData.getRow_groups();
    if (rowGroups == null) {
        // No row-group list: nothing to filter, and iterating it would throw a NullPointerException.
        return metaData;
    }
    List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
    long preStartIndex = 0;
    long preCompressedSize = 0;
    // The first column chunk of the first row group decides the strategy for the whole file.
    boolean firstColumnWithMetadata = true;
    if (!rowGroups.isEmpty()) {
        firstColumnWithMetadata = rowGroups.get(0).getColumns().get(0).isSetMeta_data();
    }
    for (RowGroup rowGroup : rowGroups) {
        long startIndex;
        ColumnChunk columnChunk = rowGroup.getColumns().get(0);
        if (firstColumnWithMetadata) {
            startIndex = getOffset(columnChunk);
        } else {
            assert rowGroup.isSetFile_offset();
            assert rowGroup.isSetTotal_compressed_size();
            // the file_offset of first block always holds the truth, while other blocks don't :
            // see PARQUET-2078 for details
            startIndex = rowGroup.getFile_offset();
            if (invalidFileOffset(startIndex, preStartIndex, preCompressedSize)) {
                // first row group's offset is always 4
                if (preStartIndex == 0) {
                    startIndex = 4;
                } else {
                    throw new InvalidFileOffsetException("corrupted RowGroup.file_offset found, "
                        + "please use file range instead of block offset for split.");
                }
            }
            // Remember this row group's position and size for validating the next one.
            preStartIndex = startIndex;
            preCompressedSize = rowGroup.getTotal_compressed_size();
        }
        if (filter.contains(startIndex)) {
            newRowGroups.add(rowGroup);
        }
    }
    metaData.setRow_groups(newRowGroups);
    return metaData;
}
Also used : InvalidFileOffsetException(org.apache.parquet.io.InvalidFileOffsetException) RowGroup(org.apache.parquet.format.RowGroup) ArrayList(java.util.ArrayList) ColumnChunk(org.apache.parquet.format.ColumnChunk)

Aggregations

ArrayList (java.util.ArrayList)1 ColumnChunk (org.apache.parquet.format.ColumnChunk)1 RowGroup (org.apache.parquet.format.RowGroup)1 InvalidFileOffsetException (org.apache.parquet.io.InvalidFileOffsetException)1