Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache, in the class ParquetFileWriter, method serializeOffsetIndexes.
private static void serializeOffsetIndexes(
    List<List<OffsetIndex>> offsetIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out,
    InternalFileEncryptor fileEncryptor) throws IOException {
  LOG.debug("{}: offset indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    BlockMetaData block = blocks.get(bIndex);
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<OffsetIndex> blockOffsetIndexes = offsetIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
      if (offsetIndex == null) {
        continue;
      }
      ColumnChunkMetaData column = columns.get(cIndex);
      BlockCipher.Encryptor offsetIndexEncryptor = null;
      byte[] offsetIndexAAD = null;
      if (null != fileEncryptor) {
        InternalColumnEncryptionSetup columnEncryptionSetup =
            fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
        if (columnEncryptionSetup.isEncrypted()) {
          offsetIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
          offsetIndexAAD = AesCipher.createModuleAAD(
              fileEncryptor.getFileAAD(), ModuleType.OffsetIndex,
              block.getOrdinal(), columnEncryptionSetup.getOrdinal(), -1);
        }
      }
      long offset = out.getPos();
      Util.writeOffsetIndex(
          ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex),
          out, offsetIndexEncryptor, offsetIndexAAD);
      column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
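The IndexReference stored by setOffsetIndexReference is what later lets a reader locate these structures. Below is a minimal read-back sketch (not part of the snippet above): it assumes a hypothetical file at "/tmp/data.parquet" and uses only the public ParquetFileReader.readOffsetIndex call, the same one the ColumnIndexValidator example below relies on.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;

public class OffsetIndexReadBack {
  public static void main(String[] args) throws Exception {
    Path path = new Path("/tmp/data.parquet"); // hypothetical file path
    try (ParquetFileReader reader =
        ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
      for (BlockMetaData block : reader.getFooter().getBlocks()) {
        for (ColumnChunkMetaData column : block.getColumns()) {
          OffsetIndex offsetIndex = reader.readOffsetIndex(column);
          if (offsetIndex == null) {
            continue; // no offset index was written for this column chunk
          }
          for (int page = 0; page < offsetIndex.getPageCount(); page++) {
            System.out.printf("column=%s page=%d offset=%d size=%d firstRow=%d%n",
                column.getPath(), page,
                offsetIndex.getOffset(page),
                offsetIndex.getCompressedPageSize(page),
                offsetIndex.getFirstRowIndex(page));
          }
        }
      }
    }
  }
}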
Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache, in the class ColumnIndexValidator, method checkContractViolations.
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  List<ContractViolation> violations = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    FileMetaData meta = reader.getFooter().getFileMetaData();
    MessageType schema = meta.getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();
    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    int rowGroupNumber = 0;
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      ColumnReadStore columnReadStore = new ColumnReadStoreImpl(
          rowGroup, new DummyRecordConverter(schema).getRootConverter(), schema, null);
      List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
      assert (columnChunks.size() == columns.size());
      for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
        ColumnDescriptor column = columns.get(columnNumber);
        ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
        ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
        if (columnIndex == null) {
          continue;
        }
        ColumnPath columnPath = columnChunk.getPath();
        OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
        List<Long> nullCounts = columnIndex.getNullCounts();
        List<Boolean> nullPages = columnIndex.getNullPages();
        long rowNumber = 0;
        ColumnReader columnReader = columnReadStore.getColumnReader(column);
        ByteBuffer prevMinValue = null;
        ByteBuffer prevMaxValue = null;
        for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
          boolean isNullPage = nullPages.get(pageNumber);
          ByteBuffer minValue = minValues.get(pageNumber);
          ByteBuffer maxValue = maxValues.get(pageNumber);
          PageValidator pageValidator = new PageValidator(
              column.getPrimitiveType(), rowGroupNumber, columnNumber, columnPath, pageNumber,
              violations, columnReader, minValue, maxValue, prevMinValue, prevMaxValue,
              boundaryOrder, nullCounts.get(pageNumber), isNullPage);
          if (!isNullPage) {
            prevMinValue = minValue;
            prevMaxValue = maxValue;
          }
          long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
          while (rowNumber <= lastRowNumberInPage) {
            pageValidator.validateValuesBelongingToRow();
            ++rowNumber;
          }
          pageValidator.finishPage();
        }
      }
      rowGroup = reader.readNextRowGroup();
      rowGroupNumber++;
    }
  }
  return violations;
}
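A minimal driver sketch for the validator, assuming a hypothetical local path. The package of ColumnIndexValidator differs between parquet-mr modules and versions, so the import for it is left to adjust.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.util.HadoopInputFile;
// plus the import for ColumnIndexValidator matching your parquet-mr version

public class ValidateColumnIndexes {
  public static void main(String[] args) throws Exception {
    // hypothetical input file
    HadoopInputFile file =
        HadoopInputFile.fromPath(new Path("/tmp/data.parquet"), new Configuration());
    // each ContractViolation describes which row group, column and page broke which contract
    ColumnIndexValidator.checkContractViolations(file)
        .forEach(System.out::println);
  }
}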
Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project presto by prestodb, in the class ParquetReader, method readPrimitive.
private ColumnChunk readPrimitive(PrimitiveField field) throws IOException {
  ColumnDescriptor columnDescriptor = field.getDescriptor();
  int fieldId = field.getId();
  ColumnReader columnReader = columnReaders[fieldId];
  if (!columnReader.isInitialized()) {
    validateParquet(currentBlockMetadata.getRowCount() > 0, "Row group has 0 rows");
    ColumnChunkMetaData metadata = getColumnChunkMetaData(columnDescriptor);
    long startingPosition = metadata.getStartingPos();
    int totalSize = toIntExact(metadata.getTotalSize());
    if (shouldUseColumnIndex(metadata.getPath())) {
      OffsetIndex offsetIndex = blockIndexStores.get(currentBlock).getOffsetIndex(metadata.getPath());
      OffsetIndex filteredOffsetIndex = ColumnIndexFilterUtils.filterOffsetIndex(
          offsetIndex, currentGroupRowRanges, blocks.get(currentBlock).getRowCount());
      List<OffsetRange> offsetRanges = ColumnIndexFilterUtils.calculateOffsetRanges(
          filteredOffsetIndex, metadata, offsetIndex.getOffset(0), startingPosition);
      List<OffsetRange> consecutiveRanges = concatRanges(offsetRanges);
      List<ByteBuffer> buffers = allocateBlocks(consecutiveRanges);
      for (int i = 0; i < consecutiveRanges.size(); i++) {
        ByteBuffer buffer = buffers.get(i);
        dataSource.readFully(startingPosition + consecutiveRanges.get(i).getOffset(), buffer.array());
      }
      PageReader pageReader = createPageReader(buffers, totalSize, metadata, columnDescriptor, filteredOffsetIndex);
      columnReader.init(pageReader, field, currentGroupRowRanges);
      if (enableVerification) {
        ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()];
        PageReader pageReaderVerification = createPageReader(buffers, totalSize, metadata, columnDescriptor, filteredOffsetIndex);
        verificationColumnReader.init(pageReaderVerification, field, currentGroupRowRanges);
      }
    } else {
      byte[] buffer = allocateBlock(totalSize);
      dataSource.readFully(startingPosition, buffer);
      PageReader pageReader = createPageReader(buffer, totalSize, metadata, columnDescriptor);
      columnReader.init(pageReader, field, null);
      if (enableVerification) {
        ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()];
        PageReader pageReaderVerification = createPageReader(buffer, totalSize, metadata, columnDescriptor);
        verificationColumnReader.init(pageReaderVerification, field, null);
      }
    }
  }
  ColumnChunk columnChunk = columnReader.readNext();
  columnChunk = typeCoercion(columnChunk, field.getDescriptor().getPrimitiveType().getPrimitiveTypeName(), field.getType());
  if (enableVerification) {
    ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()];
    ColumnChunk expected = verificationColumnReader.readNext();
    ParquetResultVerifierUtils.verifyColumnChunks(
        columnChunk, expected, columnDescriptor.getPath().length > 1, field, dataSource.getId());
  }
  // update max size per primitive column chunk
  long bytesPerCell = columnChunk.getBlock().getSizeInBytes() / batchSize;
  if (maxBytesPerCell[fieldId] < bytesPerCell) {
    // update batch size
    maxCombinedBytesPerRow = maxCombinedBytesPerRow - maxBytesPerCell[fieldId] + bytesPerCell;
    maxBatchSize = toIntExact(min(maxBatchSize, max(1, maxReadBlockBytes / maxCombinedBytesPerRow)));
    maxBytesPerCell[fieldId] = bytesPerCell;
  }
  return columnChunk;
}
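The reader above needs two facts from the offset index for each page: which byte range holds the page and which rows it covers; that is what lets a filtered read skip pages wholesale. A small sketch of those lookups follows, using only the public OffsetIndex accessors (the helper class and method name are made up for illustration).

import org.apache.parquet.internal.column.columnindex.OffsetIndex;

final class PageRanges {
  static void printPageRanges(OffsetIndex offsetIndex, long rowGroupRowCount) {
    for (int page = 0; page < offsetIndex.getPageCount(); page++) {
      long start = offsetIndex.getOffset(page);              // absolute file offset of the page
      int length = offsetIndex.getCompressedPageSize(page);  // compressed bytes to read
      long firstRow = offsetIndex.getFirstRowIndex(page);    // row index relative to the row group
      long lastRow = offsetIndex.getLastRowIndex(page, rowGroupRowCount);
      System.out.printf("page %d: bytes [%d, %d), rows [%d, %d]%n",
          page, start, start + length, firstRow, lastRow);
    }
  }
}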
Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project drill by apache, in the class ParquetFileWriter, method serializeOffsetIndexes.
private static void serializeOffsetIndexes(
    List<List<OffsetIndex>> offsetIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out,
    InternalFileEncryptor fileEncryptor) throws IOException {
  LOG.debug("{}: offset indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    BlockMetaData block = blocks.get(bIndex);
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<OffsetIndex> blockOffsetIndexes = offsetIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
      if (offsetIndex == null) {
        continue;
      }
      ColumnChunkMetaData column = columns.get(cIndex);
      BlockCipher.Encryptor offsetIndexEncryptor = null;
      byte[] offsetIndexAAD = null;
      if (null != fileEncryptor) {
        InternalColumnEncryptionSetup columnEncryptionSetup =
            fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
        if (columnEncryptionSetup.isEncrypted()) {
          offsetIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
          offsetIndexAAD = AesCipher.createModuleAAD(
              fileEncryptor.getFileAAD(), ModuleType.OffsetIndex,
              block.getOrdinal(), columnEncryptionSetup.getOrdinal(), -1);
        }
      }
      long offset = out.getPos();
      Util.writeOffsetIndex(
          ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex),
          out, offsetIndexEncryptor, offsetIndexAAD);
      column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
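As in the parquet-mr version above, the IndexReference recorded by setOffsetIndexReference holds the byte position and length of the serialized offset index. A minimal sketch, assuming footer metadata that has already been parsed, of how that reference can be inspected (the helper class and method name are illustrative only).

import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.internal.hadoop.metadata.IndexReference;

final class OffsetIndexLocations {
  static void printLocations(BlockMetaData block) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      IndexReference ref = column.getOffsetIndexReference();
      if (ref == null) {
        continue; // no offset index was written for this column chunk
      }
      System.out.printf("%s: offset index at byte %d, %d bytes long%n",
          column.getPath(), ref.getOffset(), ref.getLength());
    }
  }
}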
Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache, in the class ParquetFileReader, method internalReadFilteredRowGroup.
private ColumnChunkPageReadStore internalReadFilteredRowGroup(
    BlockMetaData block, RowRanges rowRanges, ColumnIndexStore ciStore) throws IOException {
  ColumnChunkPageReadStore rowGroup = new ColumnChunkPageReadStore(rowRanges);
  // prepare the list of consecutive parts to read them in one scan
  ChunkListBuilder builder = new ChunkListBuilder(block.getRowCount());
  List<ConsecutivePartList> allParts = new ArrayList<>();
  ConsecutivePartList currentParts = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());
      OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges, block.getRowCount());
      for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
        BenchmarkCounter.incrementTotalBytes(range.getLength());
        long startingPos = range.getOffset();
        // first part or not consecutive => new list
        if (currentParts == null || currentParts.endPos() != startingPos) {
          currentParts = new ConsecutivePartList(startingPos);
          allParts.add(currentParts);
        }
        ChunkDescriptor chunkDescriptor = new ChunkDescriptor(columnDescriptor, mc, startingPos, range.getLength());
        currentParts.addChunk(chunkDescriptor);
        builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex);
      }
    }
  }
  // actually read all the chunks
  for (ConsecutivePartList consecutiveChunks : allParts) {
    consecutiveChunks.readAll(f, builder);
  }
  for (Chunk chunk : builder.build()) {
    readChunkPages(chunk, block, rowGroup);
  }
  return rowGroup;
}
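This method is internal; the usual way to reach it is through the public filtered-read API. A minimal sketch follows, assuming a hypothetical file, an int column named "id", and a parquet-mr version that exposes ParquetReadOptions.builder() as shown; with a record filter set and column-index filtering enabled, readNextFilteredRowGroup() is the call that ends up in the code above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FilteredRowGroupRead {
  public static void main(String[] args) throws Exception {
    ParquetReadOptions options = ParquetReadOptions.builder()
        .withRecordFilter(FilterCompat.get(FilterApi.eq(FilterApi.intColumn("id"), 42)))
        .useColumnIndexFilter(true)
        .build();
    try (ParquetFileReader reader = ParquetFileReader.open(
        HadoopInputFile.fromPath(new Path("/tmp/data.parquet"), new Configuration()), options)) {
      PageReadStore rowGroup = reader.readNextFilteredRowGroup();
      while (rowGroup != null) {
        // row count reflects the rows selected by the column-index filter
        System.out.println("row group with " + rowGroup.getRowCount() + " selected rows");
        rowGroup = reader.readNextFilteredRowGroup();
      }
    }
  }
}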