Use of org.apache.parquet.column.ColumnReadStore in the parquet-mr project by Apache.
Class ColumnIndexValidator, method checkContractViolations.
/**
 * Checks the column indexes of a Parquet file against the values stored in its pages.
 *
 * <p>The file is walked row group by row group. For every column chunk that has a column index,
 * each page listed in the chunk's offset index gets its own {@code PageValidator}; the validator
 * is driven once per row belonging to the page and then finished. Every validator receives the
 * result list at construction (presumably appending the violations it detects to it).
 *
 * <p>Column chunks without a column index are skipped. Column chunks that have a column index
 * but no offset index are skipped as well, because the page boundaries needed to drive the
 * validation are unknown in that case.
 *
 * @param file the Parquet file to check
 * @return the list handed to the page validators; empty if nothing was recorded
 * @throws IOException propagated from opening or reading the file
 */
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  List<ContractViolation> violations = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    FileMetaData meta = reader.getFooter().getFileMetaData();
    MessageType schema = meta.getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();
    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    int rowGroupNumber = 0;
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup, new DummyRecordConverter(schema).getRootConverter(), schema, null);
      // Footer metadata of the row group currently being read; indexed in step with readNextRowGroup().
      List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
      assert (columnChunks.size() == columns.size());
      for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
        ColumnDescriptor column = columns.get(columnNumber);
        ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
        ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
        if (columnIndex == null) {
          // Nothing to validate for a chunk without a column index.
          continue;
        }
        OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
        if (offsetIndex == null) {
          // The offset index supplies the page count and the row range of each page. Without it
          // the loop below cannot run, so skip the chunk instead of failing with a
          // NullPointerException.
          continue;
        }
        ColumnPath columnPath = columnChunk.getPath();
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
        List<Long> nullCounts = columnIndex.getNullCounts();
        List<Boolean> nullPages = columnIndex.getNullPages();
        // Row position within the row group; deliberately carried over from page to page.
        long rowNumber = 0;
        ColumnReader columnReader = columnReadStore.getColumnReader(column);
        ByteBuffer prevMinValue = null;
        ByteBuffer prevMaxValue = null;
        for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
          boolean isNullPage = nullPages.get(pageNumber);
          ByteBuffer minValue = minValues.get(pageNumber);
          ByteBuffer maxValue = maxValues.get(pageNumber);
          PageValidator pageValidator = new PageValidator(column.getPrimitiveType(), rowGroupNumber, columnNumber, columnPath, pageNumber, violations, columnReader, minValue, maxValue, prevMinValue, prevMaxValue, boundaryOrder, nullCounts.get(pageNumber), isNullPage);
          // Only pages flagged as non-null update the "previous" bounds passed to the next
          // validator; null pages leave them as they were.
          if (!isNullPage) {
            prevMinValue = minValue;
            prevMaxValue = maxValue;
          }
          // The last row index is treated as inclusive, hence the <= in the loop condition.
          long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
          while (rowNumber <= lastRowNumberInPage) {
            pageValidator.validateValuesBelongingToRow();
            ++rowNumber;
          }
          pageValidator.finishPage();
        }
      }
      rowGroup = reader.readNextRowGroup();
      rowGroupNumber++;
    }
  }
  return violations;
}
Aggregations