Example 11 with ColumnIndex

Use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project parquet-mr by apache.

From the class ParquetFileWriter, the method serializeColumnIndexes writes each block's column indexes to the output stream, encrypting them when the file encryptor marks the column as encrypted, and records each index's offset and length on the column chunk metadata:

private static void serializeColumnIndexes(List<List<ColumnIndex>> columnIndexes, List<BlockMetaData> blocks, PositionOutputStream out, InternalFileEncryptor fileEncryptor) throws IOException {
    LOG.debug("{}: column indexes", out.getPos());
    for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
        BlockMetaData block = blocks.get(bIndex);
        List<ColumnChunkMetaData> columns = block.getColumns();
        List<ColumnIndex> blockColumnIndexes = columnIndexes.get(bIndex);
        for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
            ColumnChunkMetaData column = columns.get(cIndex);
            org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter.toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
            if (columnIndex == null) {
                continue;
            }
            BlockCipher.Encryptor columnIndexEncryptor = null;
            byte[] columnIndexAAD = null;
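            // When the file is encrypted and this column is flagged for encryption, the serialized
            // column index is encrypted with the column's metadata encryptor, using a ColumnIndex
            // module AAD built from the row-group and column ordinals.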
            if (null != fileEncryptor) {
                InternalColumnEncryptionSetup columnEncryptionSetup = fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
                if (columnEncryptionSetup.isEncrypted()) {
                    columnIndexEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
                    columnIndexAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.ColumnIndex, block.getOrdinal(), columnEncryptionSetup.getOrdinal(), -1);
                }
            }
            long offset = out.getPos();
            Util.writeColumnIndex(columnIndex, out, columnIndexEncryptor, columnIndexAAD);
            column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
        }
    }
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) BlockCipher(org.apache.parquet.format.BlockCipher) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) InternalColumnEncryptionSetup(org.apache.parquet.crypto.InternalColumnEncryptionSetup) IndexReference(org.apache.parquet.internal.hadoop.metadata.IndexReference)
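The indexes written this way can be read back with ParquetFileReader.readColumnIndex, the same call the validator in the next example relies on. A minimal sketch, assuming the usual Hadoop Configuration and Path imports plus HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile), InputFile (org.apache.parquet.io.InputFile) and ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader); the file name is a placeholder:

public static void printColumnIndexes(Configuration conf) throws IOException {
    // "data.parquet" is a placeholder path
    InputFile file = HadoopInputFile.fromPath(new Path("data.parquet"), conf);
    try (ParquetFileReader reader = ParquetFileReader.open(file)) {
        for (BlockMetaData block : reader.getFooter().getBlocks()) {
            for (ColumnChunkMetaData chunk : block.getColumns()) {
                // readColumnIndex returns null when no column index was written for this chunk
                ColumnIndex columnIndex = reader.readColumnIndex(chunk);
                if (columnIndex != null) {
                    System.out.println(chunk.getPath() + ": " + columnIndex.getBoundaryOrder());
                }
            }
        }
    }
}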

Example 12 with ColumnIndex

Use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project parquet-mr by apache.

From the class ColumnIndexValidator, the method checkContractViolations reads every row group of a file and, page by page, compares the column index entries (min/max values, null counts, boundary order) against the values actually decoded from the pages, collecting each mismatch as a ContractViolation:

public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
    List<ContractViolation> violations = new ArrayList<>();
    try (ParquetFileReader reader = ParquetFileReader.open(file)) {
        FileMetaData meta = reader.getFooter().getFileMetaData();
        MessageType schema = meta.getSchema();
        List<ColumnDescriptor> columns = schema.getColumns();
        List<BlockMetaData> blocks = reader.getFooter().getBlocks();
        int rowGroupNumber = 0;
        PageReadStore rowGroup = reader.readNextRowGroup();
        while (rowGroup != null) {
            ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup, new DummyRecordConverter(schema).getRootConverter(), schema, null);
            List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
            assert (columnChunks.size() == columns.size());
            for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
                ColumnDescriptor column = columns.get(columnNumber);
                ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
                ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
                if (columnIndex == null) {
                    continue;
                }
                ColumnPath columnPath = columnChunk.getPath();
                OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
                List<ByteBuffer> minValues = columnIndex.getMinValues();
                List<ByteBuffer> maxValues = columnIndex.getMaxValues();
                BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
                List<Long> nullCounts = columnIndex.getNullCounts();
                List<Boolean> nullPages = columnIndex.getNullPages();
                long rowNumber = 0;
                ColumnReader columnReader = columnReadStore.getColumnReader(column);
                ByteBuffer prevMinValue = null;
                ByteBuffer prevMaxValue = null;
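                // Walk the pages of this chunk and check each page's index entry (min/max, null
                // count, boundary order relative to the previous non-null page) against the values
                // decoded from the page itself.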
                for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
                    boolean isNullPage = nullPages.get(pageNumber);
                    ByteBuffer minValue = minValues.get(pageNumber);
                    ByteBuffer maxValue = maxValues.get(pageNumber);
                    PageValidator pageValidator = new PageValidator(column.getPrimitiveType(), rowGroupNumber, columnNumber, columnPath, pageNumber, violations, columnReader, minValue, maxValue, prevMinValue, prevMaxValue, boundaryOrder, nullCounts.get(pageNumber), isNullPage);
                    if (!isNullPage) {
                        prevMinValue = minValue;
                        prevMaxValue = maxValue;
                    }
                    long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
                    while (rowNumber <= lastRowNumberInPage) {
                        pageValidator.validateValuesBelongingToRow();
                        ++rowNumber;
                    }
                    pageValidator.finishPage();
                }
            }
            rowGroup = reader.readNextRowGroup();
            rowGroupNumber++;
        }
    }
    return violations;
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ColumnReadStoreImpl(org.apache.parquet.column.impl.ColumnReadStoreImpl) ArrayList(java.util.ArrayList) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) PageReadStore(org.apache.parquet.column.page.PageReadStore) BoundaryOrder(org.apache.parquet.internal.column.columnindex.BoundaryOrder) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData) MessageType(org.apache.parquet.schema.MessageType) OffsetIndex(org.apache.parquet.internal.column.columnindex.OffsetIndex) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) ByteBuffer(java.nio.ByteBuffer) ColumnReader(org.apache.parquet.column.ColumnReader) ColumnReadStore(org.apache.parquet.column.ColumnReadStore) DummyRecordConverter(org.apache.parquet.example.DummyRecordConverter)
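A minimal sketch of calling this validator on an existing file; the path is a placeholder, and the import for ColumnIndexValidator itself is omitted since only its use is shown here:

public static void printViolations(Configuration conf) throws IOException {
    // "data.parquet" is a placeholder path
    InputFile file = HadoopInputFile.fromPath(new Path("data.parquet"), conf);
    // Each returned ContractViolation describes one page whose column index entry
    // (min/max, null count or boundary order) contradicts the page contents.
    for (Object violation : ColumnIndexValidator.checkContractViolations(file)) {
        System.out.println(violation);
    }
}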

Example 13 with ColumnIndex

Use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project presto by prestodb.

From the class TestColumnIndexBuilder, the method testStaticBuildDouble builds a ColumnIndex directly from pre-encoded min/max buffers and verifies that the accessors return the supplied data unchanged:

@Test
public void testStaticBuildDouble() {
    ColumnIndex columnIndex = ColumnIndexBuilder.build(Types.required(DOUBLE).named("test_double"), BoundaryOrder.UNORDERED, asList(false, false, false, false, false, false), asList(0L, 1L, 2L, 3L, 4L, 5L), toBBList(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0), toBBList(1.0, 2.0, 3.0, 4.0, 5.0, 6.0));
    assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 4, 5);
    assertCorrectNullPages(columnIndex, false, false, false, false, false, false);
    assertCorrectValues(columnIndex.getMaxValues(), 1.0, 2.0, 3.0, 4.0, 5.0, 6.0);
    assertCorrectValues(columnIndex.getMinValues(), -1.0, -2.0, -3.0, -4.0, -5.0, -6.0);
}
Also used : ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) Test(org.testng.annotations.Test)
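toBBList and the assertCorrect* methods are helpers defined elsewhere in TestColumnIndexBuilder and are not shown here. The static build overload takes the min/max values in their binary form, which for DOUBLE is the 8-byte little-endian plain encoding. A hypothetical reconstruction of toBBList under that assumption (not the actual test code; uses java.nio.ByteBuffer, java.nio.ByteOrder, java.util.ArrayList and java.util.List):

// Hypothetical helper: encode each double as the little-endian buffer that
// ColumnIndexBuilder.build expects; a null value stands for a null page and gets an empty buffer.
private static List<ByteBuffer> toBBList(Double... values) {
    List<ByteBuffer> buffers = new ArrayList<>(values.length);
    for (Double value : values) {
        if (value == null) {
            buffers.add(ByteBuffer.allocate(0));
        } else {
            ByteBuffer buffer = ByteBuffer.allocate(Double.BYTES).order(ByteOrder.LITTLE_ENDIAN);
            buffer.putDouble(value);
            buffer.flip();
            buffers.add(buffer);
        }
    }
    return buffers;
}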

Example 14 with ColumnIndex

Use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project presto by prestodb.

From the class TestColumnIndexBuilder, the method testBuildFloat feeds page statistics into a ColumnIndexBuilder and checks boundary-order detection (UNORDERED, ASCENDING, DESCENDING), null bookkeeping, and page-level filtering for the standard predicates:

@Test
public void testBuildFloat() {
    PrimitiveType type = Types.required(FLOAT).named("test_float");
    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    // assertThat(builder, instanceOf(FloatColumnIndexBuilder.class));
    assertNull(builder.build());
    Operators.FloatColumn col = floatColumn("test_col");
    StatsBuilder sb = new StatsBuilder();
    builder.add(sb.stats(type, -4.2f, -4.1f));
    builder.add(sb.stats(type, -11.7f, 7.0f, null));
    builder.add(sb.stats(type, 2.2f, 2.2f, null, null));
    builder.add(sb.stats(type, null, null, null));
    builder.add(sb.stats(type, 1.9f, 2.32f));
    builder.add(sb.stats(type, -21.0f, 8.1f));
    assertEquals(6, builder.getPageCount());
    assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
    ColumnIndex columnIndex = builder.build();
    assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0);
    assertCorrectNullPages(columnIndex, false, false, false, true, false, false);
    assertCorrectValues(columnIndex.getMaxValues(), -4.1f, 7.0f, 2.2f, null, 2.32f, 8.1f);
    assertCorrectValues(columnIndex.getMinValues(), -4.2f, -11.7f, 2.2f, null, 1.9f, -21.0f);
    assertCorrectFiltering(columnIndex, eq(col, 0.0f), 1, 5);
    assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3);
    assertCorrectFiltering(columnIndex, notEq(col, 2.2f), 0, 1, 2, 3, 4, 5);
    assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5);
    assertCorrectFiltering(columnIndex, gt(col, 2.2f), 1, 4, 5);
    assertCorrectFiltering(columnIndex, gtEq(col, 2.2f), 1, 2, 4, 5);
    assertCorrectFiltering(columnIndex, lt(col, 0.0f), 0, 1, 5);
    assertCorrectFiltering(columnIndex, ltEq(col, 1.9f), 0, 1, 4, 5);
    assertCorrectFiltering(columnIndex, userDefined(col, FloatIsInteger.class), 1, 4, 5);
    assertCorrectFiltering(columnIndex, invert(userDefined(col, FloatIsInteger.class)), 0, 1, 2, 3, 4, 5);
    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    sb = new StatsBuilder();
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, -532.3f, -345.2f, null, null));
    builder.add(sb.stats(type, -300.6f, -234.7f, null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, null, null, null));
    builder.add(sb.stats(type, -234.6f, 2.99999f));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, 3.0f, 42.83f));
    builder.add(sb.stats(type, null, null));
    assertEquals(9, builder.getPageCount());
    assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
    columnIndex = builder.build();
    assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2);
    assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true);
    assertCorrectValues(columnIndex.getMaxValues(), null, -345.2f, -234.7f, null, null, 2.99999f, null, 42.83f, null);
    assertCorrectValues(columnIndex.getMinValues(), null, -532.3f, -300.6f, null, null, -234.6f, null, 3.0f, null);
    assertCorrectFiltering(columnIndex, eq(col, 0.0f), 5);
    assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8);
    assertCorrectFiltering(columnIndex, notEq(col, 2.2f), 0, 1, 2, 3, 4, 5, 6, 7, 8);
    assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7);
    assertCorrectFiltering(columnIndex, gt(col, 2.2f), 5, 7);
    assertCorrectFiltering(columnIndex, gtEq(col, -234.7f), 2, 5, 7);
    assertCorrectFiltering(columnIndex, lt(col, -234.6f), 1, 2);
    assertCorrectFiltering(columnIndex, ltEq(col, -234.6f), 1, 2, 5);
    assertCorrectFiltering(columnIndex, userDefined(col, FloatIsInteger.class), 1, 2, 5, 7);
    assertCorrectFiltering(columnIndex, invert(userDefined(col, FloatIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8);
    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    sb = new StatsBuilder();
    builder.add(sb.stats(type, null, null, null, null, null));
    builder.add(sb.stats(type, 532.3f, 345.2f));
    builder.add(sb.stats(type, null, null, null));
    builder.add(sb.stats(type, 234.7f, 234.6f, null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, 234.6f, -2.99999f));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, -3.0f, -42.83f));
    assertEquals(9, builder.getPageCount());
    assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
    columnIndex = builder.build();
    assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0);
    assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false);
    assertCorrectValues(columnIndex.getMaxValues(), null, 532.3f, null, 234.7f, null, 234.6f, null, null, -3.0f);
    assertCorrectValues(columnIndex.getMinValues(), null, 345.2f, null, 234.6f, null, -2.99999f, null, null, -42.83f);
    assertCorrectFiltering(columnIndex, eq(col, 234.65f), 3);
    assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7);
    assertCorrectFiltering(columnIndex, notEq(col, 2.2f), 0, 1, 2, 3, 4, 5, 6, 7, 8);
    assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8);
    assertCorrectFiltering(columnIndex, gt(col, 2.2f), 1, 3, 5);
    assertCorrectFiltering(columnIndex, gtEq(col, 2.2f), 1, 3, 5);
    assertCorrectFiltering(columnIndex, lt(col, 0.0f), 5, 8);
    assertCorrectFiltering(columnIndex, ltEq(col, 0.0f), 5, 8);
    assertCorrectFiltering(columnIndex, userDefined(col, FloatIsInteger.class), 1, 5, 8);
    assertCorrectFiltering(columnIndex, invert(userDefined(col, FloatIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8);
}
Also used : Operators(org.apache.parquet.filter2.predicate.Operators) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) ColumnIndexBuilder(org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder) PrimitiveType(org.apache.parquet.schema.PrimitiveType) Test(org.testng.annotations.Test)
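StatsBuilder is another helper of this test class: each stats(type, values...) call produces the page statistics the builder consumes, with null arguments counted as nulls and the remaining values driving min/max, which is why pages made only of nulls come back as null pages. A rough sketch of that behaviour using Parquet's Statistics API (Statistics and FloatStatistics from org.apache.parquet.column.statistics); this is a reconstruction, not the real helper, and it ignores the size tracking behind getMinMaxSize():

// Hypothetical reconstruction of the test helper for FLOAT columns: nulls bump the
// null count, non-null values update min/max; ColumnIndexBuilder.add consumes the result.
private static Statistics<?> stats(PrimitiveType type, Float... values) {
    Statistics<?> statistics = Statistics.createStats(type);
    for (Float value : values) {
        if (value == null) {
            statistics.incrementNumNulls();
        } else {
            ((FloatStatistics) statistics).updateStats(value);
        }
    }
    return statistics;
}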

Example 15 with ColumnIndex

Use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project presto by prestodb.

From the class TestColumnIndexBuilder, the method testBuildDoubleZeroNaN covers the -0.0/0.0 handling of double min/max values and shows that a NaN in the statistics suppresses the column index entirely:

@Test
public void testBuildDoubleZeroNaN() {
    PrimitiveType type = Types.required(DOUBLE).named("test_double");
    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    StatsBuilder sb = new StatsBuilder();
    builder.add(sb.stats(type, -1.0, -0.0));
    builder.add(sb.stats(type, 0.0, 1.0));
    builder.add(sb.stats(type, 1.0, 100.0));
    ColumnIndex columnIndex = builder.build();
    assertCorrectValues(columnIndex.getMinValues(), -1.0, -0.0, 1.0);
    assertCorrectValues(columnIndex.getMaxValues(), 0.0, 1.0, 100.0);
    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    builder.add(sb.stats(type, -1.0, -0.0));
    builder.add(sb.stats(type, 0.0, Double.NaN));
    builder.add(sb.stats(type, 1.0, 100.0));
    assertNull(builder.build());
}
Also used : ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) ColumnIndexBuilder(org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder) PrimitiveType(org.apache.parquet.schema.PrimitiveType) Test(org.testng.annotations.Test)
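The second half of this test is the interesting part: a single page whose statistics contain NaN makes build() return null, so no column index is written for that chunk rather than one with unreliable bounds. Callers handle this the same way serializeColumnIndexes in Example 11 does, by skipping the chunk when the index is null. A minimal illustration; writeColumnIndex below is a hypothetical stand-in for the actual write step:

// Sketch: a chunk whose float/double statistics include NaN yields no column index,
// so page-level filtering is simply unavailable for that chunk.
ColumnIndex columnIndex = builder.build();
if (columnIndex == null) {
    return; // nothing to write for this column chunk
}
writeColumnIndex(columnIndex); // hypothetical stand-in for Util.writeColumnIndex in Example 11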

Aggregations

ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex): 28 uses
Test (org.testng.annotations.Test): 17 uses
ColumnIndexBuilder (org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder): 11 uses
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 11 uses
Operators (org.apache.parquet.filter2.predicate.Operators): 9 uses
OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex): 8 uses
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 6 uses
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 5 uses
Path (org.apache.hadoop.fs.Path): 3 uses
MessageType (org.apache.parquet.schema.MessageType): 3 uses
IOException (java.io.IOException): 2 uses
ByteBuffer (java.nio.ByteBuffer): 2 uses
BytesInput (org.apache.parquet.bytes.BytesInput): 2 uses
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 2 uses
PageReadStore (org.apache.parquet.column.page.PageReadStore): 2 uses
InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup): 2 uses
BlockCipher (org.apache.parquet.format.BlockCipher): 2 uses
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 2 uses
HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile): 2 uses
Test (org.junit.Test): 2 uses