Search in sources :

Example 21 with ColumnIndex

use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project presto by prestodb.

the class TestColumnIndexBuilder method testFilterWithoutNullCounts.

/**
 * Builds a column index directly from explicit null-page flags and min/max lists while
 * passing {@code null} for the null counts, then checks the resulting index and the
 * pages selected by several predicates.
 */
@Test
public void testFilterWithoutNullCounts() {
    PrimitiveType type = Types.required(BINARY).as(UTF8).named("test_binary_utf8");
    Operators.BinaryColumn column = binaryColumn("test_col");

    // Argument order: type, boundary order, null pages, null counts (absent), min values, max values.
    ColumnIndex columnIndex = ColumnIndexBuilder.build(
            type,
            BoundaryOrder.ASCENDING,
            asList(true, true, false, false, true, false, true, false),
            null,
            toBBList(null, null, stringBinary("Beeblebrox"), stringBinary("Dent"), null, stringBinary("Jeltz"), null, stringBinary("Slartibartfast")),
            toBBList(null, null, stringBinary("Dent"), stringBinary("Dent"), null, stringBinary("Prefect"), null, stringBinary("Slartibartfast")));

    // The index reports exactly what it was built from; null counts stay absent.
    assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
    assertNull(columnIndex.getNullCounts());
    assertCorrectNullPages(columnIndex, true, true, false, false, true, false, true, false);
    assertCorrectValues(
            columnIndex.getMaxValues(),
            null, null, stringBinary("Dent"), stringBinary("Dent"), null, stringBinary("Prefect"), null, stringBinary("Slartibartfast"));
    assertCorrectValues(
            columnIndex.getMinValues(),
            null, null, stringBinary("Beeblebrox"), stringBinary("Dent"), null, stringBinary("Jeltz"), null, stringBinary("Slartibartfast"));

    // Equality / inequality predicates.
    assertCorrectFiltering(columnIndex, eq(column, stringBinary("Dent")), 2, 3);
    assertCorrectFiltering(columnIndex, eq(column, null), 0, 1, 2, 3, 4, 5, 6, 7);
    assertCorrectFiltering(columnIndex, notEq(column, stringBinary("Dent")), 0, 1, 2, 3, 4, 5, 6, 7);
    assertCorrectFiltering(columnIndex, notEq(column, null), 2, 3, 5, 7);

    // User-defined predicate and its inversion.
    assertCorrectFiltering(columnIndex, userDefined(column, BinaryDecimalIsNullOrZeroUdp.class), 0, 1, 2, 3, 4, 5, 6, 7);
    assertCorrectFiltering(columnIndex, invert(userDefined(column, BinaryDecimalIsNullOrZeroUdp.class)), 2, 3, 5, 7);
}
Also used : Operators(org.apache.parquet.filter2.predicate.Operators) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) Test(org.testng.annotations.Test)

Example 22 with ColumnIndex

use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project presto by prestodb.

the class TestColumnIndexBuilder method testBuildDouble.

/**
 * Exercises the column index builder for a required DOUBLE column with three page
 * layouts (unordered, ascending, descending). For each layout it checks the page count,
 * min/max size, boundary order, null counts, null pages, min/max values and the result
 * of filtering with several predicates.
 */
@Test
public void testBuildDouble() {
    PrimitiveType type = Types.required(DOUBLE).named("test_double");
    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    // assertThat(builder, instanceOf(DoubleColumnIndexBuilder.class));
    // A builder that has not received any page statistics builds to null.
    assertNull(builder.build());
    Operators.DoubleColumn col = doubleColumn("test_col");
    // Layout 1: six pages whose min/max values follow no order -> UNORDERED.
    // Judging by the null counts asserted below, each null passed to sb.stats(...) stands
    // for one null value in the page, and a page given only nulls is reported as a null page.
    StatsBuilder sb = new StatsBuilder();
    builder.add(sb.stats(type, -4.2, -4.1));
    builder.add(sb.stats(type, -11.7, 7.0, null));
    builder.add(sb.stats(type, 2.2, 2.2, null, null));
    builder.add(sb.stats(type, null, null, null));
    builder.add(sb.stats(type, 1.9, 2.32));
    builder.add(sb.stats(type, -21.0, 8.1));
    assertEquals(6, builder.getPageCount());
    assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
    ColumnIndex columnIndex = builder.build();
    assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0);
    assertCorrectNullPages(columnIndex, false, false, false, true, false, false);
    assertCorrectValues(columnIndex.getMaxValues(), -4.1, 7.0, 2.2, null, 2.32, 8.1);
    assertCorrectValues(columnIndex.getMinValues(), -4.2, -11.7, 2.2, null, 1.9, -21.0);
    // The trailing ints are presumably the indexes of the pages expected to pass the
    // filter -- confirm against assertCorrectFiltering.
    assertCorrectFiltering(columnIndex, eq(col, 0.0), 1, 5);
    assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3);
    assertCorrectFiltering(columnIndex, notEq(col, 2.2), 0, 1, 2, 3, 4, 5);
    assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5);
    assertCorrectFiltering(columnIndex, gt(col, 2.2), 1, 4, 5);
    assertCorrectFiltering(columnIndex, gtEq(col, 2.2), 1, 2, 4, 5);
    assertCorrectFiltering(columnIndex, lt(col, -4.2), 1, 5);
    assertCorrectFiltering(columnIndex, ltEq(col, -4.2), 0, 1, 5);
    assertCorrectFiltering(columnIndex, userDefined(col, DoubleIsInteger.class), 1, 4, 5);
    assertCorrectFiltering(columnIndex, invert(userDefined(col, DoubleIsInteger.class)), 0, 1, 2, 3, 4, 5);
    // Layout 2: nine pages, non-null pages have increasing min/max, with null-only pages
    // interleaved -> ASCENDING. Fresh builder and fresh StatsBuilder.
    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    sb = new StatsBuilder();
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, -532.3, -345.2, null, null));
    builder.add(sb.stats(type, -234.7, -234.6, null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, null, null, null));
    builder.add(sb.stats(type, -234.6, 2.99999));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, 3.0, 42.83));
    builder.add(sb.stats(type, null, null));
    assertEquals(9, builder.getPageCount());
    assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
    columnIndex = builder.build();
    assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2);
    assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true);
    assertCorrectValues(columnIndex.getMaxValues(), null, -345.2, -234.6, null, null, 2.99999, null, 42.83, null);
    assertCorrectValues(columnIndex.getMinValues(), null, -532.3, -234.7, null, null, -234.6, null, 3.0, null);
    assertCorrectFiltering(columnIndex, eq(col, 0.0), 5);
    assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8);
    assertCorrectFiltering(columnIndex, notEq(col, 0.0), 0, 1, 2, 3, 4, 5, 6, 7, 8);
    assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7);
    assertCorrectFiltering(columnIndex, gt(col, 2.99999), 7);
    assertCorrectFiltering(columnIndex, gtEq(col, 2.99999), 5, 7);
    assertCorrectFiltering(columnIndex, lt(col, -234.6), 1, 2);
    assertCorrectFiltering(columnIndex, ltEq(col, -234.6), 1, 2, 5);
    assertCorrectFiltering(columnIndex, userDefined(col, DoubleIsInteger.class), 1, 5, 7);
    assertCorrectFiltering(columnIndex, invert(userDefined(col, DoubleIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8);
    // Layout 3: nine pages, non-null pages have decreasing min/max, with null-only pages
    // interleaved -> DESCENDING. The values are passed larger-first here; the min/max
    // assertions below show the index still reports them as (min, max) correctly.
    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    sb = new StatsBuilder();
    builder.add(sb.stats(type, null, null, null, null, null));
    builder.add(sb.stats(type, 532.3, 345.2));
    builder.add(sb.stats(type, null, null, null));
    builder.add(sb.stats(type, 234.7, 234.6, null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, 234.69, -2.99999));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, -3.0, -42.83));
    assertEquals(9, builder.getPageCount());
    assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
    columnIndex = builder.build();
    assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0);
    assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false);
    assertCorrectValues(columnIndex.getMaxValues(), null, 532.3, null, 234.7, null, 234.69, null, null, -3.0);
    assertCorrectValues(columnIndex.getMinValues(), null, 345.2, null, 234.6, null, -2.99999, null, null, -42.83);
    assertCorrectFiltering(columnIndex, eq(col, 234.6), 3, 5);
    assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7);
    assertCorrectFiltering(columnIndex, notEq(col, 2.2), 0, 1, 2, 3, 4, 5, 6, 7, 8);
    assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8);
    assertCorrectFiltering(columnIndex, gt(col, 2.2), 1, 3, 5);
    assertCorrectFiltering(columnIndex, gtEq(col, 234.69), 1, 3, 5);
    assertCorrectFiltering(columnIndex, lt(col, -2.99999), 8);
    assertCorrectFiltering(columnIndex, ltEq(col, -2.99999), 5, 8);
    assertCorrectFiltering(columnIndex, userDefined(col, DoubleIsInteger.class), 1, 5, 8);
    assertCorrectFiltering(columnIndex, invert(userDefined(col, DoubleIsInteger.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8);
}
Also used : Operators(org.apache.parquet.filter2.predicate.Operators) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) ColumnIndexBuilder(org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder) PrimitiveType(org.apache.parquet.schema.PrimitiveType) Test(org.testng.annotations.Test)

Example 23 with ColumnIndex

use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project presto by prestodb.

the class TestColumnIndexBuilder method testBuildFloatZeroNaN.

/**
 * Checks how the FLOAT column index builder treats signed zeros and NaN in page
 * statistics: the asserted min values contain {@code -0.0f} and the max values
 * {@code 0.0f} where a zero bound was supplied, and a page whose max is NaN makes
 * {@code build()} return {@code null}.
 */
@Test
public void testBuildFloatZeroNaN() {
    PrimitiveType floatType = Types.required(FLOAT).named("test_float");
    StatsBuilder statsBuilder = new StatsBuilder();

    // Pages bounded by zeros of either sign.
    ColumnIndexBuilder zeroBuilder = ColumnIndexBuilder.getBuilder(floatType, Integer.MAX_VALUE);
    zeroBuilder.add(statsBuilder.stats(floatType, -1.0f, -0.0f));
    zeroBuilder.add(statsBuilder.stats(floatType, 0.0f, 1.0f));
    zeroBuilder.add(statsBuilder.stats(floatType, 1.0f, 100.0f));
    ColumnIndex zeroIndex = zeroBuilder.build();
    assertCorrectValues(zeroIndex.getMinValues(), -1.0f, -0.0f, 1.0f);
    assertCorrectValues(zeroIndex.getMaxValues(), 0.0f, 1.0f, 100.0f);

    // Same pages, but the middle page has NaN as its max; the StatsBuilder is reused.
    ColumnIndexBuilder nanBuilder = ColumnIndexBuilder.getBuilder(floatType, Integer.MAX_VALUE);
    nanBuilder.add(statsBuilder.stats(floatType, -1.0f, -0.0f));
    nanBuilder.add(statsBuilder.stats(floatType, 0.0f, Float.NaN));
    nanBuilder.add(statsBuilder.stats(floatType, 1.0f, 100.0f));
    assertNull(nanBuilder.build());
}
Also used : ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) ColumnIndexBuilder(org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder) PrimitiveType(org.apache.parquet.schema.PrimitiveType) Test(org.testng.annotations.Test)

Example 24 with ColumnIndex

use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project drill by apache.

the class ParquetFileWriter method serializeColumnIndexes.

/**
 * Writes the column index of every column chunk of every block to {@code out} and
 * records on each chunk where its index was written.
 *
 * <p>Chunks whose converted column index is {@code null} are skipped. When a file
 * encryptor is supplied and the column's encryption setup reports it as encrypted, the
 * index is written with that column's metadata encryptor and a ColumnIndex module AAD;
 * otherwise both are passed as {@code null}.
 *
 * @param columnIndexes per block, the column indexes of its chunks, addressed by the same
 *                      positions as {@code blocks} and each block's column list
 * @param blocks        the blocks whose column chunks receive the index references
 * @param out           the stream the indexes are written to
 * @param fileEncryptor the file encryptor, or {@code null}
 * @throws IOException declared by this method; presumably raised by the stream position
 *                     and write calls
 */
private static void serializeColumnIndexes(List<List<ColumnIndex>> columnIndexes, List<BlockMetaData> blocks, PositionOutputStream out, InternalFileEncryptor fileEncryptor) throws IOException {
    LOG.debug("{}: column indexes", out.getPos());
    final int blockCount = blocks.size();
    for (int blockPos = 0; blockPos < blockCount; blockPos++) {
        BlockMetaData block = blocks.get(blockPos);
        List<ColumnChunkMetaData> chunks = block.getColumns();
        List<ColumnIndex> indexesOfBlock = columnIndexes.get(blockPos);
        final int chunkCount = chunks.size();
        for (int chunkPos = 0; chunkPos < chunkCount; chunkPos++) {
            ColumnChunkMetaData chunk = chunks.get(chunkPos);
            org.apache.parquet.format.ColumnIndex formatIndex = ParquetMetadataConverter.toParquetColumnIndex(chunk.getPrimitiveType(), indexesOfBlock.get(chunkPos));
            if (formatIndex != null) {
                // Stay unencrypted unless the column's setup says otherwise.
                BlockCipher.Encryptor encryptor = null;
                byte[] aad = null;
                if (fileEncryptor != null) {
                    InternalColumnEncryptionSetup setup = fileEncryptor.getColumnSetup(chunk.getPath(), false, chunkPos);
                    if (setup.isEncrypted()) {
                        encryptor = setup.getMetaDataEncryptor();
                        aad = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.ColumnIndex, block.getOrdinal(), setup.getOrdinal(), -1);
                    }
                }
                // Remember the start position so the reference carries offset and length.
                long start = out.getPos();
                Util.writeColumnIndex(formatIndex, out, encryptor, aad);
                int length = (int) (out.getPos() - start);
                chunk.setColumnIndexReference(new IndexReference(start, length));
            }
        }
    }
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) BlockCipher(org.apache.parquet.format.BlockCipher) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) InternalColumnEncryptionSetup(org.apache.parquet.crypto.InternalColumnEncryptionSetup) IndexReference(org.apache.parquet.internal.hadoop.metadata.IndexReference)

Example 25 with ColumnIndex

use of org.apache.parquet.internal.column.columnindex.ColumnIndex in project parquet-mr by apache.

the class CompressionConveterTest method validColumnIndex.

/**
 * Compares the column indexes and offset indexes of two Parquet files, block by block and
 * column chunk by column chunk.
 *
 * <p>Column index fields are compared only when the input chunk has a column index; offset
 * index pages are compared only when the input chunk has an offset index.
 *
 * @param inputFile path of the original file
 * @param outFile   path of the file to compare against the original
 * @throws Exception declared by this method; presumably raised by the footer and index reads
 */
private void validColumnIndex(String inputFile, String outFile) throws Exception {
    ParquetMetadata inMetaData = ParquetFileReader.readFooter(conf, new Path(inputFile), NO_FILTER);
    ParquetMetadata outMetaData = ParquetFileReader.readFooter(conf, new Path(outFile), NO_FILTER);
    List<BlockMetaData> inBlocks = inMetaData.getBlocks();
    List<BlockMetaData> outBlocks = outMetaData.getBlocks();
    Assert.assertEquals(inBlocks.size(), outBlocks.size());
    try (TransParquetFileReader inReader = new TransParquetFileReader(HadoopInputFile.fromPath(new Path(inputFile), conf), HadoopReadOptions.builder(conf).build());
        TransParquetFileReader outReader = new TransParquetFileReader(HadoopInputFile.fromPath(new Path(outFile), conf), HadoopReadOptions.builder(conf).build())) {
        for (int blockPos = 0; blockPos < inBlocks.size(); blockPos++) {
            List<ColumnChunkMetaData> inChunks = inBlocks.get(blockPos).getColumns();
            List<ColumnChunkMetaData> outChunks = outBlocks.get(blockPos).getColumns();
            Assert.assertEquals(inChunks.size(), outChunks.size());
            for (int chunkPos = 0; chunkPos < inChunks.size(); chunkPos++) {
                // Read both indexes of the input chunk, then both of the output chunk.
                ColumnChunkMetaData inChunk = inChunks.get(chunkPos);
                ColumnIndex inColumnIndex = inReader.readColumnIndex(inChunk);
                OffsetIndex inOffsetIndex = inReader.readOffsetIndex(inChunk);
                ColumnChunkMetaData outChunk = outChunks.get(chunkPos);
                ColumnIndex outColumnIndex = outReader.readColumnIndex(outChunk);
                OffsetIndex outOffsetIndex = outReader.readOffsetIndex(outChunk);

                if (inColumnIndex != null) {
                    Assert.assertEquals(inColumnIndex.getBoundaryOrder(), outColumnIndex.getBoundaryOrder());
                    Assert.assertEquals(inColumnIndex.getMaxValues(), outColumnIndex.getMaxValues());
                    Assert.assertEquals(inColumnIndex.getMinValues(), outColumnIndex.getMinValues());
                    Assert.assertEquals(inColumnIndex.getNullCounts(), outColumnIndex.getNullCounts());
                }

                if (inOffsetIndex == null) {
                    continue;
                }
                List<Long> inOffsets = getOffsets(inReader, inChunk);
                List<Long> outOffsets = getOffsets(outReader, outChunk);
                Assert.assertEquals(inOffsets.size(), outOffsets.size());
                Assert.assertEquals(inOffsets.size(), inOffsetIndex.getPageCount());
                Assert.assertEquals(inOffsetIndex.getPageCount(), outOffsetIndex.getPageCount());
                for (int page = 0; page < inOffsetIndex.getPageCount(); page++) {
                    // Row ranges must agree between the files; each file's page offsets
                    // must agree with what getOffsets reports for that same file.
                    Assert.assertEquals(inOffsetIndex.getFirstRowIndex(page), outOffsetIndex.getFirstRowIndex(page));
                    Assert.assertEquals(inOffsetIndex.getLastRowIndex(page, inChunk.getValueCount()), outOffsetIndex.getLastRowIndex(page, outChunk.getValueCount()));
                    Assert.assertEquals(inOffsetIndex.getOffset(page), (long) inOffsets.get(page));
                    Assert.assertEquals(outOffsetIndex.getOffset(page), (long) outOffsets.get(page));
                }
            }
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) TransParquetFileReader(org.apache.parquet.hadoop.util.CompressionConverter.TransParquetFileReader) OffsetIndex(org.apache.parquet.internal.column.columnindex.OffsetIndex)

Aggregations

ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex): 28, Test (org.testng.annotations.Test): 17, ColumnIndexBuilder (org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder): 11, PrimitiveType (org.apache.parquet.schema.PrimitiveType): 11, Operators (org.apache.parquet.filter2.predicate.Operators): 9, OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex): 8, ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 6, BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 5, Path (org.apache.hadoop.fs.Path): 3, MessageType (org.apache.parquet.schema.MessageType): 3, IOException (java.io.IOException): 2, ByteBuffer (java.nio.ByteBuffer): 2, BytesInput (org.apache.parquet.bytes.BytesInput): 2, ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 2, PageReadStore (org.apache.parquet.column.page.PageReadStore): 2, InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup): 2, BlockCipher (org.apache.parquet.format.BlockCipher): 2, ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 2, HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile): 2, Test (org.junit.Test): 2