
Example 1 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in the Apache parquet-mr project.

From the class ColumnIndexFilter, method applyPredicate:

private RowRanges applyPredicate(Column<?> column, Function<ColumnIndex, PrimitiveIterator.OfInt> func, RowRanges rangesForMissingColumns) {
    ColumnPath columnPath = column.getColumnPath();
    if (!columns.contains(columnPath)) {
        return rangesForMissingColumns;
    }
    OffsetIndex oi = columnIndexStore.getOffsetIndex(columnPath);
    ColumnIndex ci = columnIndexStore.getColumnIndex(columnPath);
    if (ci == null) {
        LOGGER.info("No column index for column {} is available; Unable to filter on this column", columnPath);
        return allRows();
    }
    return RowRanges.create(rowCount, func.apply(ci), oi);
}
Also used: ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex)
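
To see how applyPredicate is reached, here is a minimal sketch built around the public entry point ColumnIndexFilter.calculateRowRanges; the helper method and variable names below are illustrative only, and the entry-point signature is assumed from parquet-mr's public API. calculateRowRanges evaluates the whole filter predicate against one row group, and applyPredicate is invoked per referenced column to combine the matching page indexes with the OffsetIndex into the row ranges that actually have to be read.

import java.util.Set;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;
import org.apache.parquet.internal.filter2.columnindex.RowRanges;

// Illustrative helper (hypothetical name): how many rows of a row group survive column-index filtering.
static long rowsToRead(FilterCompat.Filter filter, ColumnIndexStore store,
                       Set<ColumnPath> paths, long rowGroupRowCount) {
    // Columns without a column index fall back to allRows(), as shown in applyPredicate above.
    RowRanges ranges = ColumnIndexFilter.calculateRowRanges(filter, store, paths, rowGroupRowCount);
    return ranges.rowCount();
}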

Example 2 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in the Apache parquet-mr project.

From the class ColumnMasker, method processChunk:

private void processChunk(ColumnDescriptor descriptor, ColumnChunkMetaData chunk, ColumnReadStoreImpl crStore, TransParquetFileReader reader, ParquetFileWriter writer, MessageType schema, Set<ColumnPath> paths, MaskMode maskMode) throws IOException {
    reader.setStreamPosition(chunk.getStartingPos());
    if (paths.contains(chunk.getPath())) {
        if (maskMode.equals(MaskMode.NULLIFY)) {
            Type.Repetition repetition = descriptor.getPrimitiveType().getRepetition();
            if (repetition.equals(Type.Repetition.REQUIRED)) {
                throw new IOException("Required column [" + descriptor.getPrimitiveType().getName() + "] cannot be nullified");
            }
            nullifyColumn(descriptor, chunk, crStore, writer, schema);
        } else {
            throw new UnsupportedOperationException("Only nullify is supported for now");
        }
    } else {
        BloomFilter bloomFilter = reader.readBloomFilter(chunk);
        ColumnIndex columnIndex = reader.readColumnIndex(chunk);
        OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
        writer.appendColumnChunk(descriptor, reader.getStream(), chunk, bloomFilter, columnIndex, offsetIndex);
    }
}
Also used: GroupType (org.apache.parquet.schema.GroupType), MessageType (org.apache.parquet.schema.MessageType), Type (org.apache.parquet.schema.Type), ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex), IOException (java.io.IOException), BloomFilter (org.apache.parquet.column.values.bloomfilter.BloomFilter), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex)
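
For chunks that are not being masked, the method copies the existing BloomFilter, ColumnIndex and OffsetIndex along with the raw chunk bytes via appendColumnChunk; masked chunks are rewritten and lose those structures because the nullified values no longer match the recorded page statistics. The sketch below is a hypothetical driver, not the actual ColumnMasker code: it assumes variables meta (ParquetMetadata), schema, crStore, reader, writer, paths and maskMode are in scope, and that the chunk order inside a row group matches schema.getColumns().

for (BlockMetaData block : meta.getBlocks()) {
    writer.startBlock(block.getRowCount());
    for (int i = 0; i < block.getColumns().size(); i++) {
        // processChunk either nullifies the column or appends the chunk (and its indexes) untouched.
        processChunk(schema.getColumns().get(i), block.getColumns().get(i), crStore, reader, writer, schema, paths, maskMode);
    }
    writer.endBlock();
}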

Example 3 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in the Apache parquet-mr project.

From the class CompressionConverter, method processChunk:

private void processChunk(TransParquetFileReader reader, ParquetFileWriter writer, ColumnChunkMetaData chunk, String createdBy, CompressionCodecName codecName) throws IOException {
    CompressionCodecFactory codecFactory = HadoopCodecs.newFactory(0);
    CompressionCodecFactory.BytesInputDecompressor decompressor = codecFactory.getDecompressor(chunk.getCodec());
    CompressionCodecFactory.BytesInputCompressor compressor = codecFactory.getCompressor(codecName);
    ColumnIndex columnIndex = reader.readColumnIndex(chunk);
    OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
    reader.setStreamPosition(chunk.getStartingPos());
    DictionaryPage dictionaryPage = null;
    long readValues = 0;
    Statistics statistics = null;
    ParquetMetadataConverter converter = new ParquetMetadataConverter();
    int pageIndex = 0;
    long totalChunkValues = chunk.getValueCount();
    while (readValues < totalChunkValues) {
        PageHeader pageHeader = reader.readPageHeader();
        int compressedPageSize = pageHeader.getCompressed_page_size();
        byte[] pageLoad;
        switch(pageHeader.type) {
            case DICTIONARY_PAGE:
                if (dictionaryPage != null) {
                    throw new IOException("has more than one dictionary page in column chunk");
                }
                DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header;
                pageLoad = translatePageLoad(reader, true, compressor, decompressor, pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size());
                writer.writeDictionaryPage(new DictionaryPage(BytesInput.from(pageLoad), pageHeader.getUncompressed_page_size(), dictPageHeader.getNum_values(), converter.getEncoding(dictPageHeader.getEncoding())));
                break;
            case DATA_PAGE:
                DataPageHeader headerV1 = pageHeader.data_page_header;
                pageLoad = translatePageLoad(reader, true, compressor, decompressor, pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size());
                statistics = convertStatistics(createdBy, chunk.getPrimitiveType(), headerV1.getStatistics(), columnIndex, pageIndex, converter);
                readValues += headerV1.getNum_values();
                if (offsetIndex != null) {
                    long rowCount = 1 + offsetIndex.getLastRowIndex(pageIndex, totalChunkValues) - offsetIndex.getFirstRowIndex(pageIndex);
                    writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()), pageHeader.getUncompressed_page_size(), BytesInput.from(pageLoad), statistics, toIntWithCheck(rowCount), converter.getEncoding(headerV1.getRepetition_level_encoding()), converter.getEncoding(headerV1.getDefinition_level_encoding()), converter.getEncoding(headerV1.getEncoding()));
                } else {
                    writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()), pageHeader.getUncompressed_page_size(), BytesInput.from(pageLoad), statistics, converter.getEncoding(headerV1.getRepetition_level_encoding()), converter.getEncoding(headerV1.getDefinition_level_encoding()), converter.getEncoding(headerV1.getEncoding()));
                }
                pageIndex++;
                break;
            case DATA_PAGE_V2:
                DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
                int rlLength = headerV2.getRepetition_levels_byte_length();
                BytesInput rlLevels = readBlockAllocate(rlLength, reader);
                int dlLength = headerV2.getDefinition_levels_byte_length();
                BytesInput dlLevels = readBlockAllocate(dlLength, reader);
                int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
                int rawDataLength = pageHeader.getUncompressed_page_size() - rlLength - dlLength;
                pageLoad = translatePageLoad(reader, headerV2.is_compressed, compressor, decompressor, payLoadLength, rawDataLength);
                statistics = convertStatistics(createdBy, chunk.getPrimitiveType(), headerV2.getStatistics(), columnIndex, pageIndex, converter);
                readValues += headerV2.getNum_values();
                writer.writeDataPageV2(headerV2.getNum_rows(), headerV2.getNum_nulls(), headerV2.getNum_values(), rlLevels, dlLevels, converter.getEncoding(headerV2.getEncoding()), BytesInput.from(pageLoad), rawDataLength, statistics);
                pageIndex++;
                break;
            default:
                LOG.debug("skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize);
                break;
        }
    }
}
Also used: BytesInput (org.apache.parquet.bytes.BytesInput), IOException (java.io.IOException), Statistics (org.apache.parquet.column.statistics.Statistics), ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex), CompressionCodecFactory (org.apache.parquet.compression.CompressionCodecFactory), ParquetMetadataConverter (org.apache.parquet.format.converter.ParquetMetadataConverter), PageHeader (org.apache.parquet.format.PageHeader), DataPageHeader (org.apache.parquet.format.DataPageHeader), DataPageHeaderV2 (org.apache.parquet.format.DataPageHeaderV2), DictionaryPageHeader (org.apache.parquet.format.DictionaryPageHeader), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex), DictionaryPage (org.apache.parquet.column.page.DictionaryPage)
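
One detail worth calling out: a V1 data page header carries no row count, so the number of rows written for each page is derived from the OffsetIndex, whereas a V2 header stores it directly (getNum_rows). When no offset index is present (older files), the else branch above writes the page without an explicit row count. A minimal sketch of the arithmetic, with made-up numbers in the comments for illustration:

// Variables as in the method above.
long first = offsetIndex.getFirstRowIndex(pageIndex);                   // e.g. 100: first row stored in this page
long last  = offsetIndex.getLastRowIndex(pageIndex, totalChunkValues);  // e.g. 149: last row, bounded by the chunk's value count for the final page
long rows  = 1 + last - first;                                          // 50 rows written for this page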

Example 4 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in the Apache parquet-mr project.

From the class ColumnEncryptorTest, method validateColumns:

private void validateColumns(TransParquetFileReader inReader, TransParquetFileReader outReader, List<ColumnChunkMetaData> inColumns, List<ColumnChunkMetaData> outColumns) throws IOException {
    for (int i = 0; i < inColumns.size(); i++) {
        ColumnChunkMetaData inChunk = inColumns.get(i);
        ColumnChunkMetaData outChunk = outColumns.get(i);
        OffsetIndex inOffsetIndex = inReader.readOffsetIndex(inChunk);
        OffsetIndex outOffsetIndex = outReader.readOffsetIndex(outChunk);
        assertEquals(inOffsetIndex.getPageCount(), outOffsetIndex.getPageCount());
        if (outChunk.isEncrypted()) {
            continue;
        }
        validatePages(inReader, outReader, inOffsetIndex, outOffsetIndex);
    }
}
Also used: ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex)
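
The test checks that both files report the same number of pages and then, for columns that are not encrypted, delegates the byte-level comparison to validatePages. As a hypothetical complement (the helper below is not part of the test), the per-page entries of the two offset indexes can also be compared: the physical offsets and compressed sizes may legitimately differ between the files, but the first row index of each page should line up.

private void comparePageLayout(OffsetIndex inIndex, OffsetIndex outIndex) {
    for (int page = 0; page < inIndex.getPageCount(); page++) {
        // The same rows must start each page in both files, even if offsets and sizes differ.
        assertEquals(inIndex.getFirstRowIndex(page), outIndex.getFirstRowIndex(page));
    }
}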

Example 5 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in the Apache parquet-mr project.

From the class TestColumnChunkPageWriteStore, method test:

@Test
public void test() throws Exception {
    Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
    Path root = file.getParent();
    FileSystem fs = file.getFileSystem(conf);
    if (fs.exists(root)) {
        fs.delete(root, true);
    }
    fs.mkdirs(root);
    MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
    ColumnDescriptor col = schema.getColumns().get(0);
    Encoding dataEncoding = PLAIN;
    int valueCount = 10;
    int d = 1;
    int r = 2;
    int v = 3;
    BytesInput definitionLevels = BytesInput.fromInt(d);
    BytesInput repetitionLevels = BytesInput.fromInt(r);
    Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")).build();
    BytesInput data = BytesInput.fromInt(v);
    int rowCount = 5;
    int nullCount = 1;
    statistics.incrementNumNulls(nullCount);
    statistics.setMinMaxFromBytes(new byte[] { 0, 1, 2 }, new byte[] { 0, 1, 2, 3 });
    long pageOffset;
    long pageSize;
    {
        OutputFileForTesting outputFile = new OutputFileForTesting(file, conf);
        ParquetFileWriter writer = new ParquetFileWriter(outputFile, schema, Mode.CREATE, ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT);
        writer.start();
        writer.startBlock(rowCount);
        pageOffset = outputFile.out().getPos();
        {
            ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator(), Integer.MAX_VALUE);
            PageWriter pageWriter = store.getPageWriter(col);
            pageWriter.writePageV2(rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics);
            store.flushToFileWriter(writer);
            pageSize = outputFile.out().getPos() - pageOffset;
        }
        writer.endBlock();
        writer.end(new HashMap<String, String>());
    }
    {
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
        ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
        PageReadStore rowGroup = reader.readNextRowGroup();
        PageReader pageReader = rowGroup.getPageReader(col);
        DataPageV2 page = (DataPageV2) pageReader.readPage();
        assertEquals(rowCount, page.getRowCount());
        assertEquals(nullCount, page.getNullCount());
        assertEquals(valueCount, page.getValueCount());
        assertEquals(d, intValue(page.getDefinitionLevels()));
        assertEquals(r, intValue(page.getRepetitionLevels()));
        assertEquals(dataEncoding, page.getDataEncoding());
        assertEquals(v, intValue(page.getData()));
        // Checking column/offset indexes for the one page
        ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0);
        ColumnIndex columnIndex = reader.readColumnIndex(column);
        assertArrayEquals(statistics.getMinBytes(), columnIndex.getMinValues().get(0).array());
        assertArrayEquals(statistics.getMaxBytes(), columnIndex.getMaxValues().get(0).array());
        assertEquals(statistics.getNumNulls(), columnIndex.getNullCounts().get(0).longValue());
        assertFalse(columnIndex.getNullPages().get(0));
        OffsetIndex offsetIndex = reader.readOffsetIndex(column);
        assertEquals(1, offsetIndex.getPageCount());
        assertEquals(pageSize, offsetIndex.getCompressedPageSize(0));
        assertEquals(0, offsetIndex.getFirstRowIndex(0));
        assertEquals(pageOffset, offsetIndex.getOffset(0));
        reader.close();
    }
}
Also used: Path (org.apache.hadoop.fs.Path), BytesInput (org.apache.parquet.bytes.BytesInput), HashMap (java.util.HashMap), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), PageReader (org.apache.parquet.column.page.PageReader), Encoding (org.apache.parquet.column.Encoding), DataPageV2 (org.apache.parquet.column.page.DataPageV2), ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex), HeapByteBufferAllocator (org.apache.parquet.bytes.HeapByteBufferAllocator), PageReadStore (org.apache.parquet.column.page.PageReadStore), FileSystem (org.apache.hadoop.fs.FileSystem), MessageType (org.apache.parquet.schema.MessageType), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex), PageWriter (org.apache.parquet.column.page.PageWriter), Test (org.junit.Test)
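
The test writes a single V2 data page, reads it back, and verifies that the ColumnIndex reproduces the page statistics (min/max bytes, null count, null-page flag) and that the OffsetIndex records exactly one page at the expected file offset and compressed size. The same reader API can be used to walk every page of every chunk; a minimal sketch reusing the reader and footer variables from the test above:

BlockMetaData block = footer.getBlocks().get(0);
for (ColumnChunkMetaData chunkMeta : block.getColumns()) {
    OffsetIndex oi = reader.readOffsetIndex(chunkMeta);
    for (int page = 0; page < oi.getPageCount(); page++) {
        // Each entry gives the page's position in the file, its compressed size, and its first row index.
        System.out.printf("page %d: offset=%d, compressedSize=%d, firstRow=%d%n",
            page, oi.getOffset(page), oi.getCompressedPageSize(page), oi.getFirstRowIndex(page));
    }
}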

Aggregations

OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex): 15 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 9 usages
ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex): 8 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 5 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 4 usages
IOException (java.io.IOException): 3 usages
ByteBuffer (java.nio.ByteBuffer): 3 usages
Path (org.apache.hadoop.fs.Path): 3 usages
BytesInput (org.apache.parquet.bytes.BytesInput): 3 usages
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 3 usages
MessageType (org.apache.parquet.schema.MessageType): 3 usages
ArrayList (java.util.ArrayList): 2 usages
DictionaryPage (org.apache.parquet.column.page.DictionaryPage): 2 usages
PageReadStore (org.apache.parquet.column.page.PageReadStore): 2 usages
InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup): 2 usages
BlockCipher (org.apache.parquet.format.BlockCipher): 2 usages
DataPageHeader (org.apache.parquet.format.DataPageHeader): 2 usages
DataPageHeaderV2 (org.apache.parquet.format.DataPageHeaderV2): 2 usages
DictionaryPageHeader (org.apache.parquet.format.DictionaryPageHeader): 2 usages
PageHeader (org.apache.parquet.format.PageHeader): 2 usages