Search in sources :

Example 1 with ColumnReadStore

Use of org.apache.parquet.column.ColumnReadStore in the Apache parquet-mr project.

In class ColumnIndexValidator, method checkContractViolations.

/**
 * Walks every row group of the given Parquet file and, for each column chunk
 * that has a column index, feeds the chunk's values page by page through a
 * {@code PageValidator} together with that page's min/max, null count and
 * null-page flag taken from the index.
 *
 * <p>Column chunks for which no column index can be read are skipped.
 *
 * @param file the Parquet file to inspect
 * @return the list handed to every {@code PageValidator}; presumably the
 *         validators append the violations they detect to it (the validator
 *         is not visible from here — confirm)
 * @throws IOException if opening the file or reading a row group / index fails
 */
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
    final List<ContractViolation> found = new ArrayList<>();
    try (ParquetFileReader fileReader = ParquetFileReader.open(file)) {
        final MessageType fileSchema = fileReader.getFooter().getFileMetaData().getSchema();
        final List<ColumnDescriptor> descriptors = fileSchema.getColumns();
        final List<BlockMetaData> rowGroupMetas = fileReader.getFooter().getBlocks();

        // groupIdx tracks the position of the row group currently being read so
        // that its footer metadata can be looked up by the same index.
        int groupIdx = 0;
        for (PageReadStore pages = fileReader.readNextRowGroup();
                pages != null;
                pages = fileReader.readNextRowGroup(), groupIdx++) {
            final ColumnReadStore valueStore = new ColumnReadStoreImpl(
                    pages, new DummyRecordConverter(fileSchema).getRootConverter(), fileSchema, null);
            final List<ColumnChunkMetaData> chunks = rowGroupMetas.get(groupIdx).getColumns();
            // Schema columns and column chunks are paired up by position below.
            assert (chunks.size() == descriptors.size());

            for (int colIdx = 0; colIdx < descriptors.size(); colIdx++) {
                final ColumnDescriptor descriptor = descriptors.get(colIdx);
                final ColumnChunkMetaData chunk = chunks.get(colIdx);

                final ColumnIndex index = fileReader.readColumnIndex(chunk);
                if (index == null) {
                    // Nothing to validate for this chunk.
                    continue;
                }

                final ColumnPath path = chunk.getPath();
                // NOTE(review): unlike the column index, the offset index is used
                // without a null check — confirm it is always present here.
                final OffsetIndex offsets = fileReader.readOffsetIndex(chunk);
                final List<ByteBuffer> mins = index.getMinValues();
                final List<ByteBuffer> maxes = index.getMaxValues();
                final BoundaryOrder order = index.getBoundaryOrder();
                final List<Long> nullCounts = index.getNullCounts();
                final List<Boolean> nullPageFlags = index.getNullPages();

                // nextRow keeps counting across pages of the same column chunk.
                long nextRow = 0;
                final ColumnReader values = valueStore.getColumnReader(descriptor);
                ByteBuffer lastMin = null;
                ByteBuffer lastMax = null;

                for (int pageIdx = 0; pageIdx < offsets.getPageCount(); pageIdx++) {
                    final boolean allNulls = nullPageFlags.get(pageIdx);
                    final ByteBuffer pageMin = mins.get(pageIdx);
                    final ByteBuffer pageMax = maxes.get(pageIdx);

                    // The validator is created with the bounds of the most recent
                    // non-null page, i.e. before they are advanced below.
                    final PageValidator validator = new PageValidator(
                            descriptor.getPrimitiveType(), groupIdx, colIdx, path, pageIdx, found, values,
                            pageMin, pageMax, lastMin, lastMax, order, nullCounts.get(pageIdx), allNulls);

                    // Pages flagged as null pages do not replace the remembered bounds.
                    if (!allNulls) {
                        lastMin = pageMin;
                        lastMax = pageMax;
                    }

                    final long lastRowOfPage = offsets.getLastRowIndex(pageIdx, pages.getRowCount());
                    for (; nextRow <= lastRowOfPage; nextRow++) {
                        validator.validateValuesBelongingToRow();
                    }
                    validator.finishPage();
                }
            }
        }
    }
    return found;
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ColumnReadStoreImpl(org.apache.parquet.column.impl.ColumnReadStoreImpl) ArrayList(java.util.ArrayList) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) PageReadStore(org.apache.parquet.column.page.PageReadStore) BoundaryOrder(org.apache.parquet.internal.column.columnindex.BoundaryOrder) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) MessageType(org.apache.parquet.schema.MessageType) OffsetIndex(org.apache.parquet.internal.column.columnindex.OffsetIndex) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) ByteBuffer(java.nio.ByteBuffer) ColumnReader(org.apache.parquet.column.ColumnReader) ColumnReadStore(org.apache.parquet.column.ColumnReadStore) DummyRecordConverter(org.apache.parquet.example.DummyRecordConverter)

Aggregations

ByteBuffer (java.nio.ByteBuffer)1 ArrayList (java.util.ArrayList)1 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)1 ColumnReadStore (org.apache.parquet.column.ColumnReadStore)1 ColumnReader (org.apache.parquet.column.ColumnReader)1 ColumnReadStoreImpl (org.apache.parquet.column.impl.ColumnReadStoreImpl)1 PageReadStore (org.apache.parquet.column.page.PageReadStore)1 DummyRecordConverter (org.apache.parquet.example.DummyRecordConverter)1 BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData)1 ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)1 ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath)1 FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData)1 BoundaryOrder (org.apache.parquet.internal.column.columnindex.BoundaryOrder)1 ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex)1 OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex)1 MessageType (org.apache.parquet.schema.MessageType)1