Example 6 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.

Class ParquetFileWriter, method serializeOffsetIndexes:

private static void serializeOffsetIndexes(List<List<OffsetIndex>> offsetIndexes, List<BlockMetaData> blocks, PositionOutputStream out, InternalFileEncryptor fileEncryptor) throws IOException {
    LOG.debug("{}: offset indexes", out.getPos());
    for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
        BlockMetaData block = blocks.get(bIndex);
        List<ColumnChunkMetaData> columns = block.getColumns();
        List<OffsetIndex> blockOffsetIndexes = offsetIndexes.get(bIndex);
        for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
            OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
            if (offsetIndex == null) {
                continue;
            }
            ColumnChunkMetaData column = columns.get(cIndex);
            BlockCipher.Encryptor offsetIndexEncryptor = null;
            byte[] offsetIndexAAD = null;
            if (null != fileEncryptor) {
                InternalColumnEncryptionSetup columnEncryptionSetup = fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
                if (columnEncryptionSetup.isEncrypted()) {
                    offsetIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
                    offsetIndexAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.OffsetIndex, block.getOrdinal(), columnEncryptionSetup.getOrdinal(), -1);
                }
            }
            long offset = out.getPos();
            Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), out, offsetIndexEncryptor, offsetIndexAAD);
            column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
        }
    }
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), BlockCipher (org.apache.parquet.format.BlockCipher), InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup), IndexReference (org.apache.parquet.internal.hadoop.metadata.IndexReference), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex)
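For context, here is a minimal sketch (not part of the original example) of reading back the offset indexes that serializeOffsetIndexes writes, using parquet-mr's public reader API; the file path "example.parquet" is a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;

public class OffsetIndexDump {
    public static void main(String[] args) throws Exception {
        // "example.parquet" is a placeholder path, not taken from the original example
        try (ParquetFileReader reader = ParquetFileReader.open(
                HadoopInputFile.fromPath(new Path("example.parquet"), new Configuration()))) {
            for (BlockMetaData block : reader.getFooter().getBlocks()) {
                for (ColumnChunkMetaData chunk : block.getColumns()) {
                    OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
                    if (offsetIndex == null) {
                        // files written without column indexes carry no offset index
                        continue;
                    }
                    for (int page = 0; page < offsetIndex.getPageCount(); page++) {
                        System.out.printf("%s page %d: offset=%d, compressedSize=%d, firstRow=%d%n",
                                chunk.getPath(), page,
                                offsetIndex.getOffset(page),
                                offsetIndex.getCompressedPageSize(page),
                                offsetIndex.getFirstRowIndex(page));
                    }
                }
            }
        }
    }
}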

Example 7 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.

Class ColumnIndexValidator, method checkContractViolations:

public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
    List<ContractViolation> violations = new ArrayList<>();
    try (ParquetFileReader reader = ParquetFileReader.open(file)) {
        FileMetaData meta = reader.getFooter().getFileMetaData();
        MessageType schema = meta.getSchema();
        List<ColumnDescriptor> columns = schema.getColumns();
        List<BlockMetaData> blocks = reader.getFooter().getBlocks();
        int rowGroupNumber = 0;
        PageReadStore rowGroup = reader.readNextRowGroup();
        while (rowGroup != null) {
            ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup, new DummyRecordConverter(schema).getRootConverter(), schema, null);
            List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
            assert (columnChunks.size() == columns.size());
            for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
                ColumnDescriptor column = columns.get(columnNumber);
                ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
                ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
                if (columnIndex == null) {
                    continue;
                }
                ColumnPath columnPath = columnChunk.getPath();
                OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
                List<ByteBuffer> minValues = columnIndex.getMinValues();
                List<ByteBuffer> maxValues = columnIndex.getMaxValues();
                BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
                List<Long> nullCounts = columnIndex.getNullCounts();
                List<Boolean> nullPages = columnIndex.getNullPages();
                long rowNumber = 0;
                ColumnReader columnReader = columnReadStore.getColumnReader(column);
                ByteBuffer prevMinValue = null;
                ByteBuffer prevMaxValue = null;
                for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
                    boolean isNullPage = nullPages.get(pageNumber);
                    ByteBuffer minValue = minValues.get(pageNumber);
                    ByteBuffer maxValue = maxValues.get(pageNumber);
                    PageValidator pageValidator = new PageValidator(column.getPrimitiveType(), rowGroupNumber, columnNumber, columnPath, pageNumber, violations, columnReader, minValue, maxValue, prevMinValue, prevMaxValue, boundaryOrder, nullCounts.get(pageNumber), isNullPage);
                    if (!isNullPage) {
                        prevMinValue = minValue;
                        prevMaxValue = maxValue;
                    }
                    long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
                    while (rowNumber <= lastRowNumberInPage) {
                        pageValidator.validateValuesBelongingToRow();
                        ++rowNumber;
                    }
                    pageValidator.finishPage();
                }
            }
            rowGroup = reader.readNextRowGroup();
            rowGroupNumber++;
        }
    }
    return violations;
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), ColumnReadStoreImpl (org.apache.parquet.column.impl.ColumnReadStoreImpl), ArrayList (java.util.ArrayList), ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex), PageReadStore (org.apache.parquet.column.page.PageReadStore), BoundaryOrder (org.apache.parquet.internal.column.columnindex.BoundaryOrder), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData), MessageType (org.apache.parquet.schema.MessageType), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), ByteBuffer (java.nio.ByteBuffer), ColumnReader (org.apache.parquet.column.ColumnReader), ColumnReadStore (org.apache.parquet.column.ColumnReadStore), DummyRecordConverter (org.apache.parquet.example.DummyRecordConverter)
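A minimal sketch (not from the source) of invoking the validator above on a local file. The path is a placeholder, the import for ColumnIndexValidator is omitted because its package varies between parquet-mr modules and versions, and the sketch relies only on ContractViolation's toString() for output.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;
// import for ColumnIndexValidator omitted; its package depends on the parquet-mr module/version in use

public class ValidateColumnIndexes {
    public static void main(String[] args) throws Exception {
        // "example.parquet" is a placeholder path
        InputFile file = HadoopInputFile.fromPath(new Path("example.parquet"), new Configuration());
        // an empty result means the column and offset indexes honor the contract
        for (Object violation : ColumnIndexValidator.checkContractViolations(file)) {
            System.out.println(violation);
        }
    }
}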

Example 8 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project presto by prestodb.

Class ParquetReader, method readPrimitive:

private ColumnChunk readPrimitive(PrimitiveField field) throws IOException {
    ColumnDescriptor columnDescriptor = field.getDescriptor();
    int fieldId = field.getId();
    ColumnReader columnReader = columnReaders[fieldId];
    if (!columnReader.isInitialized()) {
        validateParquet(currentBlockMetadata.getRowCount() > 0, "Row group has 0 rows");
        ColumnChunkMetaData metadata = getColumnChunkMetaData(columnDescriptor);
        long startingPosition = metadata.getStartingPos();
        int totalSize = toIntExact(metadata.getTotalSize());
        if (shouldUseColumnIndex(metadata.getPath())) {
            OffsetIndex offsetIndex = blockIndexStores.get(currentBlock).getOffsetIndex(metadata.getPath());
            OffsetIndex filteredOffsetIndex = ColumnIndexFilterUtils.filterOffsetIndex(offsetIndex, currentGroupRowRanges, blocks.get(currentBlock).getRowCount());
            List<OffsetRange> offsetRanges = ColumnIndexFilterUtils.calculateOffsetRanges(filteredOffsetIndex, metadata, offsetIndex.getOffset(0), startingPosition);
            List<OffsetRange> consecutiveRanges = concatRanges(offsetRanges);
            List<ByteBuffer> buffers = allocateBlocks(consecutiveRanges);
            for (int i = 0; i < consecutiveRanges.size(); i++) {
                ByteBuffer buffer = buffers.get(i);
                dataSource.readFully(startingPosition + consecutiveRanges.get(i).getOffset(), buffer.array());
            }
            PageReader pageReader = createPageReader(buffers, totalSize, metadata, columnDescriptor, filteredOffsetIndex);
            columnReader.init(pageReader, field, currentGroupRowRanges);
            if (enableVerification) {
                ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()];
                PageReader pageReaderVerification = createPageReader(buffers, totalSize, metadata, columnDescriptor, filteredOffsetIndex);
                verificationColumnReader.init(pageReaderVerification, field, currentGroupRowRanges);
            }
        } else {
            byte[] buffer = allocateBlock(totalSize);
            dataSource.readFully(startingPosition, buffer);
            PageReader pageReader = createPageReader(buffer, totalSize, metadata, columnDescriptor);
            columnReader.init(pageReader, field, null);
            if (enableVerification) {
                ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()];
                PageReader pageReaderVerification = createPageReader(buffer, totalSize, metadata, columnDescriptor);
                verificationColumnReader.init(pageReaderVerification, field, null);
            }
        }
    }
    ColumnChunk columnChunk = columnReader.readNext();
    columnChunk = typeCoercion(columnChunk, field.getDescriptor().getPrimitiveType().getPrimitiveTypeName(), field.getType());
    if (enableVerification) {
        ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()];
        ColumnChunk expected = verificationColumnReader.readNext();
        ParquetResultVerifierUtils.verifyColumnChunks(columnChunk, expected, columnDescriptor.getPath().length > 1, field, dataSource.getId());
    }
    // update max size per primitive column chunk
    long bytesPerCell = columnChunk.getBlock().getSizeInBytes() / batchSize;
    if (maxBytesPerCell[fieldId] < bytesPerCell) {
        // update batch size
        maxCombinedBytesPerRow = maxCombinedBytesPerRow - maxBytesPerCell[fieldId] + bytesPerCell;
        maxBatchSize = toIntExact(min(maxBatchSize, max(1, maxReadBlockBytes / maxCombinedBytesPerRow)));
        maxBytesPerCell[fieldId] = bytesPerCell;
    }
    return columnChunk;
}
Also used: ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ByteBuffer (java.nio.ByteBuffer), OffsetRange (com.facebook.presto.parquet.reader.ColumnIndexFilterUtils.OffsetRange), ColumnReader (com.facebook.presto.parquet.ColumnReader), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex)
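The row-range filtering above ultimately reduces to mapping row positions onto pages via the offset index. A minimal, generic sketch of that lookup (a hypothetical helper, not Presto code) using only the OffsetIndex methods from parquet-mr:

import org.apache.parquet.internal.column.columnindex.OffsetIndex;

public final class OffsetIndexLookup {
    // Returns the index of the page containing targetRow, or -1 if it is outside the row group.
    public static int findPageForRow(OffsetIndex offsetIndex, long targetRow, long rowGroupRowCount) {
        for (int page = 0; page < offsetIndex.getPageCount(); page++) {
            long firstRow = offsetIndex.getFirstRowIndex(page);
            long lastRow = offsetIndex.getLastRowIndex(page, rowGroupRowCount);
            if (targetRow >= firstRow && targetRow <= lastRow) {
                return page;
            }
        }
        return -1;
    }
}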

Example 9 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project drill by apache.

Class ParquetFileWriter, method serializeOffsetIndexes:

private static void serializeOffsetIndexes(List<List<OffsetIndex>> offsetIndexes, List<BlockMetaData> blocks, PositionOutputStream out, InternalFileEncryptor fileEncryptor) throws IOException {
    LOG.debug("{}: offset indexes", out.getPos());
    for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
        BlockMetaData block = blocks.get(bIndex);
        List<ColumnChunkMetaData> columns = block.getColumns();
        List<OffsetIndex> blockOffsetIndexes = offsetIndexes.get(bIndex);
        for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
            OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
            if (offsetIndex == null) {
                continue;
            }
            ColumnChunkMetaData column = columns.get(cIndex);
            BlockCipher.Encryptor offsetIndexEncryptor = null;
            byte[] offsetIndexAAD = null;
            if (null != fileEncryptor) {
                InternalColumnEncryptionSetup columnEncryptionSetup = fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
                if (columnEncryptionSetup.isEncrypted()) {
                    offsetIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
                    offsetIndexAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.OffsetIndex, block.getOrdinal(), columnEncryptionSetup.getOrdinal(), -1);
                }
            }
            long offset = out.getPos();
            Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), out, offsetIndexEncryptor, offsetIndexAAD);
            column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
        }
    }
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), BlockCipher (org.apache.parquet.format.BlockCipher), InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup), IndexReference (org.apache.parquet.internal.hadoop.metadata.IndexReference), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex)
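After serializeOffsetIndexes runs, each ColumnChunkMetaData carries an IndexReference pointing at the bytes just written. A minimal sketch (placeholder path; assumes getOffsetIndexReference() is available on ColumnChunkMetaData in your parquet version) that prints those references from a finished file:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.internal.hadoop.metadata.IndexReference;

public class OffsetIndexReferences {
    public static void main(String[] args) throws Exception {
        // "example.parquet" is a placeholder path
        try (ParquetFileReader reader = ParquetFileReader.open(
                HadoopInputFile.fromPath(new Path("example.parquet"), new Configuration()))) {
            for (BlockMetaData block : reader.getFooter().getBlocks()) {
                for (ColumnChunkMetaData chunk : block.getColumns()) {
                    IndexReference ref = chunk.getOffsetIndexReference();
                    if (ref != null) {
                        System.out.printf("%s: offset index at byte %d, %d bytes%n",
                                chunk.getPath(), ref.getOffset(), ref.getLength());
                    }
                }
            }
        }
    }
}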

Example 10 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by apache.

Class ParquetFileReader, method internalReadFilteredRowGroup:

private ColumnChunkPageReadStore internalReadFilteredRowGroup(BlockMetaData block, RowRanges rowRanges, ColumnIndexStore ciStore) throws IOException {
    ColumnChunkPageReadStore rowGroup = new ColumnChunkPageReadStore(rowRanges);
    // prepare the list of consecutive parts to read them in one scan
    ChunkListBuilder builder = new ChunkListBuilder(block.getRowCount());
    List<ConsecutivePartList> allParts = new ArrayList<>();
    ConsecutivePartList currentParts = null;
    for (ColumnChunkMetaData mc : block.getColumns()) {
        ColumnPath pathKey = mc.getPath();
        ColumnDescriptor columnDescriptor = paths.get(pathKey);
        if (columnDescriptor != null) {
            OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());
            OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges, block.getRowCount());
            for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
                BenchmarkCounter.incrementTotalBytes(range.getLength());
                long startingPos = range.getOffset();
                // first part or not consecutive => new list
                if (currentParts == null || currentParts.endPos() != startingPos) {
                    currentParts = new ConsecutivePartList(startingPos);
                    allParts.add(currentParts);
                }
                ChunkDescriptor chunkDescriptor = new ChunkDescriptor(columnDescriptor, mc, startingPos, range.getLength());
                currentParts.addChunk(chunkDescriptor);
                builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex);
            }
        }
    }
    // actually read all the chunks
    for (ConsecutivePartList consecutiveChunks : allParts) {
        consecutiveChunks.readAll(f, builder);
    }
    for (Chunk chunk : builder.build()) {
        readChunkPages(chunk, block, rowGroup);
    }
    return rowGroup;
}
Also used: ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ArrayList (java.util.ArrayList), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), OffsetRange (org.apache.parquet.hadoop.ColumnIndexFilterUtils.OffsetRange), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex), ColumnIndexFilterUtils.filterOffsetIndex (org.apache.parquet.hadoop.ColumnIndexFilterUtils.filterOffsetIndex)
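internalReadFilteredRowGroup is private; the public path that exercises it is to open the reader with a record filter and column-index filtering enabled, then call readNextFilteredRowGroup(). A minimal sketch follows; the column name "id", the predicate, and the file path are hypothetical, and the useColumnIndexFilter(true) builder option is assumed to exist in your parquet-mr version.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FilteredRowGroupRead {
    public static void main(String[] args) throws Exception {
        ParquetReadOptions options = ParquetReadOptions.builder()
                // hypothetical predicate on an int column named "id"
                .withRecordFilter(FilterCompat.get(FilterApi.eq(FilterApi.intColumn("id"), 42)))
                .useColumnIndexFilter(true)
                .build();
        // "example.parquet" is a placeholder path
        try (ParquetFileReader reader = ParquetFileReader.open(
                HadoopInputFile.fromPath(new Path("example.parquet"), new Configuration()), options)) {
            PageReadStore rowGroup = reader.readNextFilteredRowGroup();
            while (rowGroup != null) {
                // only pages overlapping the matching row ranges have been read
                System.out.println("rows after filtering: " + rowGroup.getRowCount());
                rowGroup = reader.readNextFilteredRowGroup();
            }
        }
    }
}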

Aggregations

OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex): 15
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 9
ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex): 8
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 5
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 4
IOException (java.io.IOException): 3
ByteBuffer (java.nio.ByteBuffer): 3
Path (org.apache.hadoop.fs.Path): 3
BytesInput (org.apache.parquet.bytes.BytesInput): 3
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 3
MessageType (org.apache.parquet.schema.MessageType): 3
ArrayList (java.util.ArrayList): 2
DictionaryPage (org.apache.parquet.column.page.DictionaryPage): 2
PageReadStore (org.apache.parquet.column.page.PageReadStore): 2
InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup): 2
BlockCipher (org.apache.parquet.format.BlockCipher): 2
DataPageHeader (org.apache.parquet.format.DataPageHeader): 2
DataPageHeaderV2 (org.apache.parquet.format.DataPageHeaderV2): 2
DictionaryPageHeader (org.apache.parquet.format.DictionaryPageHeader): 2
PageHeader (org.apache.parquet.format.PageHeader): 2