
Example 26 with ColumnIndex

Use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project parquet-mr by apache.

The class TestParquetFileWriter, method testColumnIndexWriteRead:

@Test
public void testColumnIndexWriteRead() throws Exception {
    File testFile = temp.newFile();
    testFile.delete();
    Path path = new Path(testFile.toURI());
    Configuration configuration = new Configuration();
    ParquetFileWriter w = new ParquetFileWriter(configuration, SCHEMA, path);
    w.start();
    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.startBlock(4);
    w.startColumn(C1, 5, CODEC);
    long c1p1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), statsC1(null, Binary.fromString("aaa")), 1, BIT_PACKED, BIT_PACKED, PLAIN);
    long c1p2Starts = w.getPos();
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), statsC1(Binary.fromString("bbb"), Binary.fromString("ccc")), 3, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c1Ends = w.getPos();
    w.startColumn(C2, 6, CODEC);
    long c2p1Starts = w.getPos();
    w.writeDataPage(2, 4, BytesInput.from(BYTES2), statsC2(117l, 100l), 1, BIT_PACKED, BIT_PACKED, PLAIN);
    long c2p2Starts = w.getPos();
    w.writeDataPage(3, 4, BytesInput.from(BYTES2), statsC2(null, null, null), 2, BIT_PACKED, BIT_PACKED, PLAIN);
    long c2p3Starts = w.getPos();
    w.writeDataPage(1, 4, BytesInput.from(BYTES2), statsC2(0l), 1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    long c2Ends = w.getPos();
    w.endBlock();
    w.startBlock(4);
    w.startColumn(C1, 7, CODEC);
    // Creating huge stats so the column index will reach the limit and won't be written
    w.writeDataPage(7, 4, BytesInput.from(BYTES3), statsC1(Binary.fromConstantByteArray(new byte[(int) MAX_STATS_SIZE]), Binary.fromConstantByteArray(new byte[1])), 4, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.startColumn(C2, 8, CODEC);
    w.writeDataPage(8, 4, BytesInput.from(BYTES4), EMPTY_STATS, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    w.endBlock();
    w.end(new HashMap<String, String>());
    try (ParquetFileReader reader = new ParquetFileReader(HadoopInputFile.fromPath(path, configuration), ParquetReadOptions.builder().build())) {
        ParquetMetadata footer = reader.getFooter();
        assertEquals(3, footer.getBlocks().size());
        BlockMetaData blockMeta = footer.getBlocks().get(1);
        assertEquals(2, blockMeta.getColumns().size());
        ColumnIndex columnIndex = reader.readColumnIndex(blockMeta.getColumns().get(0));
        assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
        assertTrue(Arrays.asList(1l, 0l).equals(columnIndex.getNullCounts()));
        assertTrue(Arrays.asList(false, false).equals(columnIndex.getNullPages()));
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        assertEquals(2, minValues.size());
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        assertEquals(2, maxValues.size());
        assertEquals("aaa", new String(minValues.get(0).array(), StandardCharsets.UTF_8));
        assertEquals("aaa", new String(maxValues.get(0).array(), StandardCharsets.UTF_8));
        assertEquals("bbb", new String(minValues.get(1).array(), StandardCharsets.UTF_8));
        assertEquals("ccc", new String(maxValues.get(1).array(), StandardCharsets.UTF_8));
        columnIndex = reader.readColumnIndex(blockMeta.getColumns().get(1));
        assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
        assertTrue(Arrays.asList(0l, 3l, 0l).equals(columnIndex.getNullCounts()));
        assertTrue(Arrays.asList(false, true, false).equals(columnIndex.getNullPages()));
        minValues = columnIndex.getMinValues();
        assertEquals(3, minValues.size());
        maxValues = columnIndex.getMaxValues();
        assertEquals(3, maxValues.size());
        assertEquals(100, BytesUtils.bytesToLong(minValues.get(0).array()));
        assertEquals(117, BytesUtils.bytesToLong(maxValues.get(0).array()));
        assertEquals(0, minValues.get(1).array().length);
        assertEquals(0, maxValues.get(1).array().length);
        assertEquals(0, BytesUtils.bytesToLong(minValues.get(2).array()));
        assertEquals(0, BytesUtils.bytesToLong(maxValues.get(2).array()));
        OffsetIndex offsetIndex = reader.readOffsetIndex(blockMeta.getColumns().get(0));
        assertEquals(2, offsetIndex.getPageCount());
        assertEquals(c1p1Starts, offsetIndex.getOffset(0));
        assertEquals(c1p2Starts, offsetIndex.getOffset(1));
        assertEquals(c1p2Starts - c1p1Starts, offsetIndex.getCompressedPageSize(0));
        assertEquals(c1Ends - c1p2Starts, offsetIndex.getCompressedPageSize(1));
        assertEquals(0, offsetIndex.getFirstRowIndex(0));
        assertEquals(1, offsetIndex.getFirstRowIndex(1));
        offsetIndex = reader.readOffsetIndex(blockMeta.getColumns().get(1));
        assertEquals(3, offsetIndex.getPageCount());
        assertEquals(c2p1Starts, offsetIndex.getOffset(0));
        assertEquals(c2p2Starts, offsetIndex.getOffset(1));
        assertEquals(c2p3Starts, offsetIndex.getOffset(2));
        assertEquals(c2p2Starts - c2p1Starts, offsetIndex.getCompressedPageSize(0));
        assertEquals(c2p3Starts - c2p2Starts, offsetIndex.getCompressedPageSize(1));
        assertEquals(c2Ends - c2p3Starts, offsetIndex.getCompressedPageSize(2));
        assertEquals(0, offsetIndex.getFirstRowIndex(0));
        assertEquals(1, offsetIndex.getFirstRowIndex(1));
        assertEquals(3, offsetIndex.getFirstRowIndex(2));
        assertNull(reader.readColumnIndex(footer.getBlocks().get(2).getColumns().get(0)));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) ByteBuffer(java.nio.ByteBuffer) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) File(java.io.File) OffsetIndex(org.apache.parquet.internal.column.columnindex.OffsetIndex) Test(org.junit.Test)

Example 27 with ColumnIndex

Use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project parquet-mr by apache.

The class ShowColumnIndexCommand, method run:

@Override
public int run() throws IOException {
    Preconditions.checkArgument(files != null && files.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(files.size() == 1, "Cannot process multiple Parquet files.");
    InputFile in = HadoopInputFile.fromPath(qualifiedPath(files.get(0)), getConf());
    if (!showColumnIndex && !showOffsetIndex) {
        showColumnIndex = true;
        showOffsetIndex = true;
    }
    Set<String> rowGroupIndexSet = new HashSet<>();
    if (rowGroupIndexes != null) {
        rowGroupIndexSet.addAll(rowGroupIndexes);
    }
    try (ParquetFileReader reader = ParquetFileReader.open(in)) {
        boolean firstBlock = true;
        int rowGroupIndex = 0;
        for (BlockMetaData block : reader.getFooter().getBlocks()) {
            if (!rowGroupIndexSet.isEmpty() && !rowGroupIndexSet.contains(Integer.toString(rowGroupIndex))) {
                ++rowGroupIndex;
                continue;
            }
            if (!firstBlock) {
                console.info("");
            }
            firstBlock = false;
            console.info("row-group {}:", rowGroupIndex);
            for (ColumnChunkMetaData column : getColumns(block)) {
                String path = column.getPath().toDotString();
                if (showColumnIndex) {
                    console.info("column index for column {}:", path);
                    ColumnIndex columnIndex = reader.readColumnIndex(column);
                    if (columnIndex == null) {
                        console.info("NONE");
                    } else {
                        console.info(columnIndex.toString());
                    }
                }
                if (showOffsetIndex) {
                    console.info("offset index for column {}:", path);
                    OffsetIndex offsetIndex = reader.readOffsetIndex(column);
                    if (offsetIndex == null) {
                        console.info("NONE");
                    } else {
                        console.info(offsetIndex.toString());
                    }
                }
            }
            ++rowGroupIndex;
        }
    }
    return 0;
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex), ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex), InputFile (org.apache.parquet.io.InputFile), HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile), HashSet (java.util.HashSet)

Example 28 with ColumnIndex

Use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project parquet-mr by apache.

The class TestParquetMetadataConverter, method testColumnIndexConversion:

@Test
public void testColumnIndexConversion() {
    PrimitiveType type = Types.required(PrimitiveTypeName.INT64).named("test_int64");
    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    Statistics<?> stats = Statistics.createStats(type);
    stats.incrementNumNulls(16);
    stats.updateStats(-100l);
    stats.updateStats(100l);
    builder.add(stats);
    stats = Statistics.createStats(type);
    stats.incrementNumNulls(111);
    builder.add(stats);
    stats = Statistics.createStats(type);
    stats.updateStats(200l);
    stats.updateStats(500l);
    builder.add(stats);
    org.apache.parquet.format.ColumnIndex parquetColumnIndex = ParquetMetadataConverter.toParquetColumnIndex(type, builder.build());
    ColumnIndex columnIndex = ParquetMetadataConverter.fromParquetColumnIndex(type, parquetColumnIndex);
    assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
    assertTrue(Arrays.asList(false, true, false).equals(columnIndex.getNullPages()));
    assertTrue(Arrays.asList(16l, 111l, 0l).equals(columnIndex.getNullCounts()));
    assertTrue(Arrays.asList(ByteBuffer.wrap(BytesUtils.longToBytes(-100l)), ByteBuffer.allocate(0), ByteBuffer.wrap(BytesUtils.longToBytes(200l))).equals(columnIndex.getMinValues()));
    assertTrue(Arrays.asList(ByteBuffer.wrap(BytesUtils.longToBytes(100l)), ByteBuffer.allocate(0), ByteBuffer.wrap(BytesUtils.longToBytes(500l))).equals(columnIndex.getMaxValues()));
    assertNull("Should handle null column index", ParquetMetadataConverter.toParquetColumnIndex(Types.required(PrimitiveTypeName.INT32).named("test_int32"), null));
    assertNull("Should ignore unsupported types", ParquetMetadataConverter.toParquetColumnIndex(Types.required(PrimitiveTypeName.INT96).named("test_int96"), columnIndex));
    assertNull("Should ignore unsupported types", ParquetMetadataConverter.fromParquetColumnIndex(Types.required(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(12).as(OriginalType.INTERVAL).named("test_interval"), parquetColumnIndex));
}
Also used: ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex), ColumnIndexBuilder (org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder), PrimitiveType (org.apache.parquet.schema.PrimitiveType), Test (org.junit.Test)
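
The builder exercised in this test can also be used on its own to see how per-page Statistics turn into a ColumnIndex. The following sketch is illustrative only (the column name and page contents are made up); it builds an index for an INT64 column with one value page and one all-null page and prints what the builder derived, using only the calls that appear in the test above.

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class BuildColumnIndexSketch {
    public static void main(String[] args) {
        PrimitiveType type = Types.required(PrimitiveTypeName.INT64).named("value");
        // Integer.MAX_VALUE, as in the test above: effectively no truncation of min/max values.
        ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);

        // One Statistics object per page, added in page order.
        Statistics<?> page1 = Statistics.createStats(type);
        page1.updateStats(1L);
        page1.updateStats(10L);
        builder.add(page1);

        Statistics<?> page2 = Statistics.createStats(type);
        // A page containing only nulls: no values, only a null count.
        page2.incrementNumNulls(5);
        builder.add(page2);

        ColumnIndex columnIndex = builder.build();
        System.out.println("boundary order: " + columnIndex.getBoundaryOrder());
        System.out.println("null pages:     " + columnIndex.getNullPages());
        System.out.println("null counts:    " + columnIndex.getNullCounts());
    }
}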

Aggregations

ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex): 28 usages
Test (org.testng.annotations.Test): 17 usages
ColumnIndexBuilder (org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder): 11 usages
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 11 usages
Operators (org.apache.parquet.filter2.predicate.Operators): 9 usages
OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex): 8 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 6 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 5 usages
Path (org.apache.hadoop.fs.Path): 3 usages
MessageType (org.apache.parquet.schema.MessageType): 3 usages
IOException (java.io.IOException): 2 usages
ByteBuffer (java.nio.ByteBuffer): 2 usages
BytesInput (org.apache.parquet.bytes.BytesInput): 2 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 2 usages
PageReadStore (org.apache.parquet.column.page.PageReadStore): 2 usages
InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup): 2 usages
BlockCipher (org.apache.parquet.format.BlockCipher): 2 usages
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 2 usages
HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile): 2 usages
Test (org.junit.Test): 2 usages