Example 11 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by Apache.

From class TestParquetMetadataConverter, method testOffsetIndexConversion:

@Test
public void testOffsetIndexConversion() {
    OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder();
    // Arguments: offset, compressed page size, first row index
    builder.add(1000, 10000, 0);
    builder.add(22000, 12000, 100);
    // Round-trip through the parquet-format representation; build(100000) shifts every offset by 100000
    OffsetIndex offsetIndex = ParquetMetadataConverter.fromParquetOffsetIndex(
        ParquetMetadataConverter.toParquetOffsetIndex(builder.build(100000)));
    assertEquals(2, offsetIndex.getPageCount());
    assertEquals(101000, offsetIndex.getOffset(0));
    assertEquals(10000, offsetIndex.getCompressedPageSize(0));
    assertEquals(0, offsetIndex.getFirstRowIndex(0));
    assertEquals(122000, offsetIndex.getOffset(1));
    assertEquals(12000, offsetIndex.getCompressedPageSize(1));
    assertEquals(100, offsetIndex.getFirstRowIndex(1));
}
Also used: OffsetIndexBuilder (org.apache.parquet.internal.column.columnindex.OffsetIndexBuilder), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex), Test (org.junit.Test)
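
The non-obvious detail in this test is the shift: build(100000) adds 100000 to every recorded page offset, which is why the assertions expect 101000 and 122000 rather than 1000 and 22000. A minimal sketch of that behavior in isolation, assuming only the builder API used above (the argument order is inferred from the assertions in testOffsetIndexConversion):

// Hedged sketch: add(offset, compressedPageSize, firstRowIndex)
OffsetIndexBuilder builder = OffsetIndexBuilder.getBuilder();
builder.add(1000, 10000, 0);
OffsetIndex index = builder.build(100000);
// The shift passed to build() is applied to each offset:
// index.getOffset(0) == 101000; the page size and first row index are unchanged.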

Example 12 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by Apache.

From class CompressionConveterTest, method validColumnIndex:

private void validColumnIndex(String inputFile, String outFile) throws Exception {
    ParquetMetadata inMetaData = ParquetFileReader.readFooter(conf, new Path(inputFile), NO_FILTER);
    ParquetMetadata outMetaData = ParquetFileReader.readFooter(conf, new Path(outFile), NO_FILTER);
    Assert.assertEquals(inMetaData.getBlocks().size(), outMetaData.getBlocks().size());
    try (TransParquetFileReader inReader = new TransParquetFileReader(
            HadoopInputFile.fromPath(new Path(inputFile), conf), HadoopReadOptions.builder(conf).build());
        TransParquetFileReader outReader = new TransParquetFileReader(
            HadoopInputFile.fromPath(new Path(outFile), conf), HadoopReadOptions.builder(conf).build())) {
        for (int i = 0; i < inMetaData.getBlocks().size(); i++) {
            BlockMetaData inBlockMetaData = inMetaData.getBlocks().get(i);
            BlockMetaData outBlockMetaData = outMetaData.getBlocks().get(i);
            Assert.assertEquals(inBlockMetaData.getColumns().size(), outBlockMetaData.getColumns().size());
            for (int j = 0; j < inBlockMetaData.getColumns().size(); j++) {
                ColumnChunkMetaData inChunk = inBlockMetaData.getColumns().get(j);
                ColumnIndex inColumnIndex = inReader.readColumnIndex(inChunk);
                OffsetIndex inOffsetIndex = inReader.readOffsetIndex(inChunk);
                ColumnChunkMetaData outChunk = outBlockMetaData.getColumns().get(j);
                ColumnIndex outColumnIndex = outReader.readColumnIndex(outChunk);
                OffsetIndex outOffsetIndex = outReader.readOffsetIndex(outChunk);
                if (inColumnIndex != null) {
                    Assert.assertEquals(inColumnIndex.getBoundaryOrder(), outColumnIndex.getBoundaryOrder());
                    Assert.assertEquals(inColumnIndex.getMaxValues(), outColumnIndex.getMaxValues());
                    Assert.assertEquals(inColumnIndex.getMinValues(), outColumnIndex.getMinValues());
                    Assert.assertEquals(inColumnIndex.getNullCounts(), outColumnIndex.getNullCounts());
                }
                if (inOffsetIndex != null) {
                    List<Long> inOffsets = getOffsets(inReader, inChunk);
                    List<Long> outOffsets = getOffsets(outReader, outChunk);
                    Assert.assertEquals(inOffsets.size(), outOffsets.size());
                    Assert.assertEquals(inOffsets.size(), inOffsetIndex.getPageCount());
                    Assert.assertEquals(inOffsetIndex.getPageCount(), outOffsetIndex.getPageCount());
                    for (int k = 0; k < inOffsetIndex.getPageCount(); k++) {
                        Assert.assertEquals(inOffsetIndex.getFirstRowIndex(k), outOffsetIndex.getFirstRowIndex(k));
                        Assert.assertEquals(inOffsetIndex.getLastRowIndex(k, inChunk.getValueCount()),
                            outOffsetIndex.getLastRowIndex(k, outChunk.getValueCount()));
                        Assert.assertEquals(inOffsetIndex.getOffset(k), (long) inOffsets.get(k));
                        Assert.assertEquals(outOffsetIndex.getOffset(k), (long) outOffsets.get(k));
                    }
                }
            }
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), TransParquetFileReader (org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex)
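
Note why getLastRowIndex takes a second argument: the offset index stores only each page's first row index, so the last row of page k is the first row of page k+1 minus one, and for the final page it falls back to the supplied total (the test passes the chunk's value count). A sketch of that derivation, assuming k, offsetIndex, and totalRowCount are in scope as above:

// Hedged sketch mirroring what OffsetIndex.getLastRowIndex(k, total) computes:
long lastRowIndex = (k + 1 < offsetIndex.getPageCount())
    ? offsetIndex.getFirstRowIndex(k + 1) - 1   // bounded by the next page's first row
    : totalRowCount - 1;                        // the last page runs to the end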

Example 13 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by Apache.

From class TestParquetFileWriter, method testColumnIndexWriteRead:

@Test
public void testColumnIndexWriteRead() throws Exception {
    File testFile = temp.newFile();
    testFile.delete();
    Path path = new Path(testFile.toURI());
    Configuration configuration = new Configuration();
    ParquetFileWriter w = new ParquetFileWriter(configuration, SCHEMA, path);
    w.start();
    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.startBlock(4);
    w.startColumn(C1, 5, CODEC);
    long c1p1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), statsC1(null, Binary.fromString("aaa")), 1, BIT_PACKED, BIT_PACKED, PLAIN);
    long c1p2Starts = w.getPos();
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), statsC1(Binary.fromString("bbb"), Binary.fromString("ccc")), 3, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 6, CODEC);
    long c2p1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES2), statsC2(117L, 100L), 1, BIT_PACKED, BIT_PACKED, PLAIN);
    long c2p2Starts = w.getPos();
    w.writeDataPage(3, 4, BytesInput.from(BYTES2), statsC2(null, null, null), 2, BIT_PACKED, BIT_PACKED, PLAIN);
    long c2p3Starts = w.getPos();
    w.writeDataPage(1, 4, BytesInput.from(BYTES2), statsC2(0L), 1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();
    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    // Create huge stats so the column index reaches the size limit and won't be written
    w.writeDataPage(7, 4, BytesInput.from(BYTES3),
        statsC1(Binary.fromConstantByteArray(new byte[(int) MAX_STATS_SIZE]), Binary.fromConstantByteArray(new byte[1])),
        4, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.end(new HashMap<String, String>());
    try (ParquetFileReader reader = new ParquetFileReader(HadoopInputFile.fromPath(path, configuration), ParquetReadOptions.builder().build())) {
        ParquetMetadata footer = reader.getFooter();
        assertEquals(3, footer.getBlocks().size());
        BlockMetaData blockMeta = footer.getBlocks().get(1);
        assertEquals(2, blockMeta.getColumns().size());
        ColumnIndex columnIndex = reader.readColumnIndex(blockMeta.getColumns().get(0));
        assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
        assertTrue(Arrays.asList(1L, 0L).equals(columnIndex.getNullCounts()));
        assertTrue(Arrays.asList(false, false).equals(columnIndex.getNullPages()));
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        assertEquals(2, minValues.size());
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        assertEquals(2, maxValues.size());
        assertEquals("aaa", new String(minValues.get(0).array(), StandardCharsets.UTF_8));
        assertEquals("aaa", new String(maxValues.get(0).array(), StandardCharsets.UTF_8));
        assertEquals("bbb", new String(minValues.get(1).array(), StandardCharsets.UTF_8));
        assertEquals("ccc", new String(maxValues.get(1).array(), StandardCharsets.UTF_8));
        columnIndex = reader.readColumnIndex(blockMeta.getColumns().get(1));
        assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
        assertTrue(Arrays.asList(0L, 3L, 0L).equals(columnIndex.getNullCounts()));
        assertTrue(Arrays.asList(false, true, false).equals(columnIndex.getNullPages()));
        minValues = columnIndex.getMinValues();
        assertEquals(3, minValues.size());
        maxValues = columnIndex.getMaxValues();
        assertEquals(3, maxValues.size());
        assertEquals(100, BytesUtils.bytesToLong(minValues.get(0).array()));
        assertEquals(117, BytesUtils.bytesToLong(maxValues.get(0).array()));
        assertEquals(0, minValues.get(1).array().length);
        assertEquals(0, maxValues.get(1).array().length);
        assertEquals(0, BytesUtils.bytesToLong(minValues.get(2).array()));
        assertEquals(0, BytesUtils.bytesToLong(maxValues.get(2).array()));
        OffsetIndex offsetIndex = reader.readOffsetIndex(blockMeta.getColumns().get(0));
        assertEquals(2, offsetIndex.getPageCount());
        assertEquals(c1p1Starts, offsetIndex.getOffset(0));
        assertEquals(c1p2Starts, offsetIndex.getOffset(1));
        assertEquals(c1p2Starts - c1p1Starts, offsetIndex.getCompressedPageSize(0));
        assertEquals(c1Ends - c1p2Starts, offsetIndex.getCompressedPageSize(1));
        assertEquals(0, offsetIndex.getFirstRowIndex(0));
        assertEquals(1, offsetIndex.getFirstRowIndex(1));
        offsetIndex = reader.readOffsetIndex(blockMeta.getColumns().get(1));
        assertEquals(3, offsetIndex.getPageCount());
        assertEquals(c2p1Starts, offsetIndex.getOffset(0));
        assertEquals(c2p2Starts, offsetIndex.getOffset(1));
        assertEquals(c2p3Starts, offsetIndex.getOffset(2));
        assertEquals(c2p2Starts - c2p1Starts, offsetIndex.getCompressedPageSize(0));
        assertEquals(c2p3Starts - c2p2Starts, offsetIndex.getCompressedPageSize(1));
        assertEquals(c2Ends - c2p3Starts, offsetIndex.getCompressedPageSize(2));
        assertEquals(0, offsetIndex.getFirstRowIndex(0));
        assertEquals(1, offsetIndex.getFirstRowIndex(1));
        assertEquals(3, offsetIndex.getFirstRowIndex(2));
        assertNull(reader.readColumnIndex(footer.getBlocks().get(2).getColumns().get(0)));
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), ByteBuffer (java.nio.ByteBuffer), ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex), HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile), File (java.io.File), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex), Test (org.junit.Test)
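
The closing assertNull works because the third block's first column was written with statistics of MAX_STATS_SIZE bytes, and the writer drops a column index whose min/max values would exceed the size limit. A hedged follow-up sketch, reusing reader and footer from the try block above and assuming (as the write path suggests) that the offset index is unaffected by oversized statistics:

ColumnChunkMetaData hugeStatsColumn = footer.getBlocks().get(2).getColumns().get(0);
// The column index was suppressed at write time because of the huge statistics...
assertNull(reader.readColumnIndex(hugeStatsColumn));
// ...while the offset index only records page locations, so it should still be present.
assertNotNull(reader.readOffsetIndex(hugeStatsColumn));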

Example 14 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by Apache.

From class ShowColumnIndexCommand, method run:

@Override
public int run() throws IOException {
    Preconditions.checkArgument(files != null && files.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(files.size() == 1, "Cannot process multiple Parquet files.");
    InputFile in = HadoopInputFile.fromPath(qualifiedPath(files.get(0)), getConf());
    if (!showColumnIndex && !showOffsetIndex) {
        showColumnIndex = true;
        showOffsetIndex = true;
    }
    Set<String> rowGroupIndexSet = new HashSet<>();
    if (rowGroupIndexes != null) {
        rowGroupIndexSet.addAll(rowGroupIndexes);
    }
    try (ParquetFileReader reader = ParquetFileReader.open(in)) {
        boolean firstBlock = true;
        int rowGroupIndex = 0;
        for (BlockMetaData block : reader.getFooter().getBlocks()) {
            if (!rowGroupIndexSet.isEmpty() && !rowGroupIndexSet.contains(Integer.toString(rowGroupIndex))) {
                ++rowGroupIndex;
                continue;
            }
            if (!firstBlock) {
                console.info("");
            }
            firstBlock = false;
            console.info("row-group {}:", rowGroupIndex);
            for (ColumnChunkMetaData column : getColumns(block)) {
                String path = column.getPath().toDotString();
                if (showColumnIndex) {
                    console.info("column index for column {}:", path);
                    ColumnIndex columnIndex = reader.readColumnIndex(column);
                    if (columnIndex == null) {
                        console.info("NONE");
                    } else {
                        console.info(columnIndex.toString());
                    }
                }
                if (showOffsetIndex) {
                    console.info("offset index for column {}:", path);
                    OffsetIndex offsetIndex = reader.readOffsetIndex(column);
                    if (offsetIndex == null) {
                        console.info("NONE");
                    } else {
                        console.info(offsetIndex.toString());
                    }
                }
            }
            ++rowGroupIndex;
        }
    }
    return 0;
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex), InputFile (org.apache.parquet.io.InputFile), HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile), HashSet (java.util.HashSet)
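
For context, this run() method backs a parquet-cli subcommand: when neither index flag is set, the code above forces both showColumnIndex and showOffsetIndex on, so a bare invocation such as "parquet column-index /path/to/file.parquet" would print both indexes for every row group. The subcommand and flag spellings here are assumptions based on the class and field names; verify them against parquet-cli's help output.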

Example 15 with OffsetIndex

Use of org.apache.parquet.internal.column.columnindex.OffsetIndex in project parquet-mr by Apache.

From class ColumnEncryptor, method processPages:

private void processPages(TransParquetFileReader reader, ColumnChunkMetaData chunk, ParquetFileWriter writer, String createdBy, int blockId, int columnId, boolean encrypt) throws IOException {
    int pageOrdinal = 0;
    EncryptorRunTime encryptorRunTime = new EncryptorRunTime(writer.getEncryptor(), chunk, blockId, columnId);
    DictionaryPage dictionaryPage = null;
    long readValues = 0;
    ParquetMetadataConverter converter = new ParquetMetadataConverter();
    OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
    reader.setStreamPosition(chunk.getStartingPos());
    long totalChunkValues = chunk.getValueCount();
    while (readValues < totalChunkValues) {
        PageHeader pageHeader = reader.readPageHeader();
        byte[] pageLoad;
        switch(pageHeader.type) {
            case DICTIONARY_PAGE:
                if (dictionaryPage != null) {
                    throw new IOException("has more than one dictionary page in column chunk");
                }
                // No quickUpdatePageAAD needed for dictionary page
                DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header;
                pageLoad = processPayload(reader, pageHeader.getCompressed_page_size(),
                    encryptorRunTime.getDataEncryptor(), encryptorRunTime.getDictPageAAD(), encrypt);
                writer.writeDictionaryPage(
                    new DictionaryPage(BytesInput.from(pageLoad), pageHeader.getUncompressed_page_size(),
                        dictPageHeader.getNum_values(), converter.getEncoding(dictPageHeader.getEncoding())),
                    encryptorRunTime.getMetaDataEncryptor(), encryptorRunTime.getDictPageHeaderAAD());
                break;
            case DATA_PAGE:
                if (encrypt) {
                    AesCipher.quickUpdatePageAAD(encryptorRunTime.getDataPageHeaderAAD(), pageOrdinal);
                    AesCipher.quickUpdatePageAAD(encryptorRunTime.getDataPageAAD(), pageOrdinal);
                }
                DataPageHeader headerV1 = pageHeader.data_page_header;
                pageLoad = processPayload(reader, pageHeader.getCompressed_page_size(),
                    encryptorRunTime.getDataEncryptor(), encryptorRunTime.getDataPageAAD(), encrypt);
                readValues += headerV1.getNum_values();
                if (offsetIndex != null) {
                    // Derive this page's row count from the offset index (row indexes are inclusive)
                    long rowCount = 1 + offsetIndex.getLastRowIndex(pageOrdinal, totalChunkValues)
                        - offsetIndex.getFirstRowIndex(pageOrdinal);
                    writer.writeDataPage(Math.toIntExact(headerV1.getNum_values()),
                        pageHeader.getUncompressed_page_size(), BytesInput.from(pageLoad),
                        converter.fromParquetStatistics(createdBy, headerV1.getStatistics(), chunk.getPrimitiveType()),
                        rowCount,
                        converter.getEncoding(headerV1.getRepetition_level_encoding()),
                        converter.getEncoding(headerV1.getDefinition_level_encoding()),
                        converter.getEncoding(headerV1.getEncoding()),
                        encryptorRunTime.getMetaDataEncryptor(), encryptorRunTime.getDataPageHeaderAAD());
                } else {
                    // No offset index: fall back to the writeDataPage overload without a row count
                    writer.writeDataPage(Math.toIntExact(headerV1.getNum_values()),
                        pageHeader.getUncompressed_page_size(), BytesInput.from(pageLoad),
                        converter.fromParquetStatistics(createdBy, headerV1.getStatistics(), chunk.getPrimitiveType()),
                        converter.getEncoding(headerV1.getRepetition_level_encoding()),
                        converter.getEncoding(headerV1.getDefinition_level_encoding()),
                        converter.getEncoding(headerV1.getEncoding()),
                        encryptorRunTime.getMetaDataEncryptor(), encryptorRunTime.getDataPageHeaderAAD());
                }
                pageOrdinal++;
                break;
            case DATA_PAGE_V2:
                if (encrypt) {
                    AesCipher.quickUpdatePageAAD(encryptorRunTime.getDataPageHeaderAAD(), pageOrdinal);
                    AesCipher.quickUpdatePageAAD(encryptorRunTime.getDataPageAAD(), pageOrdinal);
                }
                DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
                int rlLength = headerV2.getRepetition_levels_byte_length();
                BytesInput rlLevels = readBlockAllocate(rlLength, reader);
                int dlLength = headerV2.getDefinition_levels_byte_length();
                BytesInput dlLevels = readBlockAllocate(dlLength, reader);
                int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
                int rawDataLength = pageHeader.getUncompressed_page_size() - rlLength - dlLength;
                pageLoad = processPayload(reader, payLoadLength, encryptorRunTime.getDataEncryptor(), encryptorRunTime.getDataPageAAD(), encrypt);
                readValues += headerV2.getNum_values();
                writer.writeDataPageV2(headerV2.getNum_rows(), headerV2.getNum_nulls(), headerV2.getNum_values(),
                    rlLevels, dlLevels, converter.getEncoding(headerV2.getEncoding()), BytesInput.from(pageLoad),
                    rawDataLength,
                    converter.fromParquetStatistics(createdBy, headerV2.getStatistics(), chunk.getPrimitiveType()));
                pageOrdinal++;
                break;
            default:
                break;
        }
    }
}
Also used: BytesInput (org.apache.parquet.bytes.BytesInput), IOException (java.io.IOException), ParquetMetadataConverter (org.apache.parquet.format.converter.ParquetMetadataConverter), DictionaryPageHeader (org.apache.parquet.format.DictionaryPageHeader), DataPageHeader (org.apache.parquet.format.DataPageHeader), PageHeader (org.apache.parquet.format.PageHeader), DataPageHeaderV2 (org.apache.parquet.format.DataPageHeaderV2), DictionaryPage (org.apache.parquet.column.page.DictionaryPage), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex)
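
In the DATA_PAGE branch, the page header only carries value counts, so the per-page row count passed to writeDataPage is recomputed from the offset index. The arithmetic as a standalone sketch, with pageOrdinal, offsetIndex, and totalChunkValues as in processPages above:

// Row indexes are inclusive on both ends, hence the +1.
long firstRow = offsetIndex.getFirstRowIndex(pageOrdinal);
long lastRow = offsetIndex.getLastRowIndex(pageOrdinal, totalChunkValues);
long rowCount = 1 + lastRow - firstRow;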

Aggregations

OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex): 15 uses
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 9 uses
ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex): 8 uses
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 5 uses
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 4 uses
IOException (java.io.IOException): 3 uses
ByteBuffer (java.nio.ByteBuffer): 3 uses
Path (org.apache.hadoop.fs.Path): 3 uses
BytesInput (org.apache.parquet.bytes.BytesInput): 3 uses
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 3 uses
MessageType (org.apache.parquet.schema.MessageType): 3 uses
ArrayList (java.util.ArrayList): 2 uses
DictionaryPage (org.apache.parquet.column.page.DictionaryPage): 2 uses
PageReadStore (org.apache.parquet.column.page.PageReadStore): 2 uses
InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup): 2 uses
BlockCipher (org.apache.parquet.format.BlockCipher): 2 uses
DataPageHeader (org.apache.parquet.format.DataPageHeader): 2 uses
DataPageHeaderV2 (org.apache.parquet.format.DataPageHeaderV2): 2 uses
DictionaryPageHeader (org.apache.parquet.format.DictionaryPageHeader): 2 uses
PageHeader (org.apache.parquet.format.PageHeader): 2 uses