Search in sources:

Example 16 with IntColumn

Use of org.apache.parquet.filter2.predicate.Operators.IntColumn in project parquet-mr by apache.

In the class TestRowGroupFilter, method testApplyRowGroupFilters.

@Test
public void testApplyRowGroupFilters() {
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    IntStatistics stats1 = new IntStatistics();
    stats1.setMinMax(10, 100);
    stats1.setNumNulls(4);
    BlockMetaData b1 = makeBlockFromStats(stats1, 301);
    blocks.add(b1);
    IntStatistics stats2 = new IntStatistics();
    stats2.setMinMax(8, 102);
    stats2.setNumNulls(0);
    BlockMetaData b2 = makeBlockFromStats(stats2, 302);
    blocks.add(b2);
    IntStatistics stats3 = new IntStatistics();
    stats3.setMinMax(100, 102);
    stats3.setNumNulls(12);
    BlockMetaData b3 = makeBlockFromStats(stats3, 303);
    blocks.add(b3);
    IntStatistics stats4 = new IntStatistics();
    stats4.setMinMax(0, 0);
    stats4.setNumNulls(304);
    BlockMetaData b4 = makeBlockFromStats(stats4, 304);
    blocks.add(b4);
    IntStatistics stats5 = new IntStatistics();
    stats5.setMinMax(50, 50);
    stats5.setNumNulls(7);
    BlockMetaData b5 = makeBlockFromStats(stats5, 305);
    blocks.add(b5);
    IntStatistics stats6 = new IntStatistics();
    stats6.setMinMax(0, 0);
    stats6.setNumNulls(12);
    BlockMetaData b6 = makeBlockFromStats(stats6, 306);
    blocks.add(b6);
    MessageType schema = MessageTypeParser.parseMessageType("message Document { optional int32 foo; }");
    IntColumn foo = intColumn("foo");
    List<BlockMetaData> filtered = RowGroupFilter.filterRowGroups(FilterCompat.get(eq(foo, 50)), blocks, schema);
    assertEquals(Arrays.asList(b1, b2, b5), filtered);
    filtered = RowGroupFilter.filterRowGroups(FilterCompat.get(notEq(foo, 50)), blocks, schema);
    assertEquals(Arrays.asList(b1, b2, b3, b4, b5, b6), filtered);
    filtered = RowGroupFilter.filterRowGroups(FilterCompat.get(eq(foo, null)), blocks, schema);
    assertEquals(Arrays.asList(b1, b3, b4, b5, b6), filtered);
    filtered = RowGroupFilter.filterRowGroups(FilterCompat.get(notEq(foo, null)), blocks, schema);
    assertEquals(Arrays.asList(b1, b2, b3, b5, b6), filtered);
    filtered = RowGroupFilter.filterRowGroups(FilterCompat.get(eq(foo, 0)), blocks, schema);
    assertEquals(Arrays.asList(b6), filtered);
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) ArrayList(java.util.ArrayList) MessageType(org.apache.parquet.schema.MessageType) IntColumn(org.apache.parquet.filter2.predicate.Operators.IntColumn) Test(org.junit.Test)

Example 17 with IntColumn

Use of org.apache.parquet.filter2.predicate.Operators.IntColumn in project parquet-mr by apache.

In the class DictionaryFilterTest, method testInverseUdpMissingColumn.

@Test
public void testInverseUdpMissingColumn() throws Exception {
    InInt32UDP nullRejecting = new InInt32UDP(ImmutableSet.of(42));
    InInt32UDP nullAccepting = new InInt32UDP(Sets.newHashSet((Integer) null));
    IntColumn fake = intColumn("missing_column");
    assertTrue("Should drop block for null accepting udp", canDrop(LogicalInverseRewriter.rewrite(not(userDefined(fake, nullAccepting))), ccmd, dictionaries));
    assertFalse("Should not drop block for null rejecting udp", canDrop(LogicalInverseRewriter.rewrite(not(userDefined(fake, nullRejecting))), ccmd, dictionaries));
}
Also used : IntColumn(org.apache.parquet.filter2.predicate.Operators.IntColumn) Test(org.junit.Test)

Example 18 with IntColumn

Use of org.apache.parquet.filter2.predicate.Operators.IntColumn in project parquet-mr by apache.

In the class DictionaryFilterTest, method testColumnWithDictionaryAndPlainEncodings.

@Test
public void testColumnWithDictionaryAndPlainEncodings() throws Exception {
    IntColumn plain = intColumn("fallback_binary_field");
    DictionaryPageReadStore dictionaryStore = mock(DictionaryPageReadStore.class);
    assertFalse("Should never drop block using plain encoding", canDrop(eq(plain, -10), ccmd, dictionaryStore));
    assertFalse("Should never drop block using plain encoding", canDrop(lt(plain, -10), ccmd, dictionaryStore));
    assertFalse("Should never drop block using plain encoding", canDrop(ltEq(plain, -10), ccmd, dictionaryStore));
    assertFalse("Should never drop block using plain encoding", canDrop(gt(plain, nElements + 10), ccmd, dictionaryStore));
    assertFalse("Should never drop block using plain encoding", canDrop(gtEq(plain, nElements + 10), ccmd, dictionaryStore));
    assertFalse("Should never drop block using plain encoding", canDrop(notEq(plain, nElements + 10), ccmd, dictionaryStore));
    verifyZeroInteractions(dictionaryStore);
}
Also used : DictionaryPageReadStore(org.apache.parquet.column.page.DictionaryPageReadStore) IntColumn(org.apache.parquet.filter2.predicate.Operators.IntColumn) Test(org.junit.Test)

Example 19 with IntColumn

Use of org.apache.parquet.filter2.predicate.Operators.IntColumn in project parquet-mr by apache.

In the class TestStatisticsFilter, method testOr.

@Test
public void testOr() {
    FilterPredicate yes = eq(intColumn, 9);
    FilterPredicate no = eq(doubleColumn, 50D);
    assertTrue(canDrop(or(yes, yes), columnMetas));
    assertFalse(canDrop(or(yes, no), columnMetas));
    assertFalse(canDrop(or(no, yes), columnMetas));
    assertFalse(canDrop(or(no, no), columnMetas));
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) Test(org.junit.Test)

Example 20 with IntColumn

Use of org.apache.parquet.filter2.predicate.Operators.IntColumn in project presto by prestodb.

In the class TestColumnIndexBuilder, method testBuildUInt8.

@Test
public void testBuildUInt8() {
    PrimitiveType type = Types.required(INT32).as(UINT_8).named("test_uint8");
    ColumnIndexBuilder builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    // assertThat(builder, instanceOf(IntColumnIndexBuilder.class));
    assertNull(builder.build());
    Operators.IntColumn col = intColumn("test_col");
    StatsBuilder sb = new StatsBuilder();
    builder.add(sb.stats(type, 4, 10));
    builder.add(sb.stats(type, 11, 17, null));
    builder.add(sb.stats(type, 2, 2, null, null));
    builder.add(sb.stats(type, null, null, null));
    builder.add(sb.stats(type, 1, 0xFF));
    builder.add(sb.stats(type, 0xEF, 0xFA));
    assertEquals(6, builder.getPageCount());
    assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
    ColumnIndex columnIndex = builder.build();
    assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 0, 1, 2, 3, 0, 0);
    assertCorrectNullPages(columnIndex, false, false, false, true, false, false);
    assertCorrectValues(columnIndex.getMaxValues(), 10, 17, 2, null, 0xFF, 0xFA);
    assertCorrectValues(columnIndex.getMinValues(), 4, 11, 2, null, 1, 0xEF);
    assertCorrectFiltering(columnIndex, eq(col, 2), 2, 4);
    assertCorrectFiltering(columnIndex, eq(col, null), 1, 2, 3);
    assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5);
    assertCorrectFiltering(columnIndex, notEq(col, null), 0, 1, 2, 4, 5);
    assertCorrectFiltering(columnIndex, gt(col, 2), 0, 1, 4, 5);
    assertCorrectFiltering(columnIndex, gtEq(col, 2), 0, 1, 2, 4, 5);
    assertCorrectFiltering(columnIndex, lt(col, 0xEF), 0, 1, 2, 4);
    assertCorrectFiltering(columnIndex, ltEq(col, 0xEF), 0, 1, 2, 4, 5);
    assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 0, 1, 4, 5);
    assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5);
    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    sb = new StatsBuilder();
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, 0, 0, null, null));
    builder.add(sb.stats(type, 0, 42, null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, null, null, null));
    builder.add(sb.stats(type, 42, 0xEE));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, 0xEF, 0xFF));
    builder.add(sb.stats(type, null, null));
    assertEquals(9, builder.getPageCount());
    assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
    columnIndex = builder.build();
    assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 2, 2, 1, 2, 3, 0, 2, 0, 2);
    assertCorrectNullPages(columnIndex, true, false, false, true, true, false, true, false, true);
    assertCorrectValues(columnIndex.getMaxValues(), null, 0, 42, null, null, 0xEE, null, 0xFF, null);
    assertCorrectValues(columnIndex.getMinValues(), null, 0, 0, null, null, 42, null, 0xEF, null);
    assertCorrectFiltering(columnIndex, eq(col, 2), 2);
    assertCorrectFiltering(columnIndex, eq(col, null), 0, 1, 2, 3, 4, 6, 8);
    assertCorrectFiltering(columnIndex, notEq(col, 2), 0, 1, 2, 3, 4, 5, 6, 7, 8);
    assertCorrectFiltering(columnIndex, notEq(col, null), 1, 2, 5, 7);
    assertCorrectFiltering(columnIndex, gt(col, 0xEE), 7);
    assertCorrectFiltering(columnIndex, gtEq(col, 0xEE), 5, 7);
    assertCorrectFiltering(columnIndex, lt(col, 42), 1, 2);
    assertCorrectFiltering(columnIndex, ltEq(col, 42), 1, 2, 5);
    assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 2, 5, 7);
    assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 1, 2, 3, 4, 5, 6, 7, 8);
    builder = ColumnIndexBuilder.getBuilder(type, Integer.MAX_VALUE);
    sb = new StatsBuilder();
    builder.add(sb.stats(type, null, null, null, null, null));
    builder.add(sb.stats(type, 0xFF, 0xFF));
    builder.add(sb.stats(type, null, null, null));
    builder.add(sb.stats(type, 0xEF, 0xEA, null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, 0xEE, 42));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, null, null));
    builder.add(sb.stats(type, 41, 0));
    assertEquals(9, builder.getPageCount());
    assertEquals(sb.getMinMaxSize(), builder.getMinMaxSize());
    columnIndex = builder.build();
    assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
    assertCorrectNullCounts(columnIndex, 5, 0, 3, 1, 2, 0, 2, 2, 0);
    assertCorrectNullPages(columnIndex, true, false, true, false, true, false, true, true, false);
    assertCorrectValues(columnIndex.getMaxValues(), null, 0xFF, null, 0xEF, null, 0xEE, null, null, 41);
    assertCorrectValues(columnIndex.getMinValues(), null, 0xFF, null, 0xEA, null, 42, null, null, 0);
    assertCorrectFiltering(columnIndex, eq(col, 0xAB), 5);
    assertCorrectFiltering(columnIndex, eq(col, null), 0, 2, 3, 4, 6, 7);
    assertCorrectFiltering(columnIndex, notEq(col, 0xFF), 0, 2, 3, 4, 5, 6, 7, 8);
    assertCorrectFiltering(columnIndex, notEq(col, null), 1, 3, 5, 8);
    assertCorrectFiltering(columnIndex, gt(col, 0xFF));
    assertCorrectFiltering(columnIndex, gtEq(col, 0xFF), 1);
    assertCorrectFiltering(columnIndex, lt(col, 42), 8);
    assertCorrectFiltering(columnIndex, ltEq(col, 42), 5, 8);
    assertCorrectFiltering(columnIndex, userDefined(col, IntegerIsDivisableWith3.class), 1, 3, 5, 8);
    assertCorrectFiltering(columnIndex, invert(userDefined(col, IntegerIsDivisableWith3.class)), 0, 2, 3, 4, 5, 6, 7, 8);
}
Also used : Operators(org.apache.parquet.filter2.predicate.Operators) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) ColumnIndexBuilder(org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder) PrimitiveType(org.apache.parquet.schema.PrimitiveType) Test(org.testng.annotations.Test)

Aggregations

Test (org.junit.Test): 14, FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate): 13, IntColumn (org.apache.parquet.filter2.predicate.Operators.IntColumn): 8, IntStatistics (org.apache.parquet.column.statistics.IntStatistics): 3, Configuration (org.apache.hadoop.conf.Configuration): 2, DictionaryPageReadStore (org.apache.parquet.column.page.DictionaryPageReadStore): 2, Operators (org.apache.parquet.filter2.predicate.Operators): 2, ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex): 2, ColumnIndexBuilder (org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder): 2, PrimitiveType (org.apache.parquet.schema.PrimitiveType): 2, Test (org.testng.annotations.Test): 2, ByteArrayInputStream (java.io.ByteArrayInputStream): 1, ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1, ObjectInputStream (java.io.ObjectInputStream): 1, ObjectOutputStream (java.io.ObjectOutputStream): 1, ArrayList (java.util.ArrayList): 1, Job (org.apache.hadoop.mapreduce.Job): 1, DoubleStatistics (org.apache.parquet.column.statistics.DoubleStatistics): 1, RecordFilter (org.apache.parquet.filter.RecordFilter): 1, UnboundRecordFilter (org.apache.parquet.filter.UnboundRecordFilter): 1