Search in sources :

Example 11 with FilterPredicate

Use of org.apache.parquet.filter2.predicate.FilterPredicate in the Apache Hive project.

Class FilterPredicateLeafBuilder, method buildPredicate.

/**
 * Builds a filter predicate for an operator that takes more than one constant.
 *
 * <p>{@code IN} combines the literals, in list order, into a chain of {@code or(...)} over
 * {@code EQUALS} leaf predicates; with no literals the result is {@code null}.
 * {@code BETWEEN} needs exactly two literals (first, second) and produces
 * {@code and(LESS_THAN_EQUALS second, not(LESS_THAN first))}.
 *
 * @param op         IN or BETWEEN
 * @param literals   the constants the column is compared against
 * @param columnName name of the column handed to every leaf predicate
 * @return the combined predicate, or {@code null} for IN with an empty literal list
 * @throws RuntimeException if BETWEEN is not given exactly two literals, or if {@code op}
 *                          is neither IN nor BETWEEN
 */
public FilterPredicate buildPredicate(PredicateLeaf.Operator op, List<Object> literals, String columnName) throws Exception {
    switch(op) {
        case IN: {
            // Left-fold the per-literal equality checks into a single disjunction.
            FilterPredicate disjunction = null;
            for (Object candidate : literals) {
                FilterPredicate equality = buildPredict(PredicateLeaf.Operator.EQUALS, candidate, columnName);
                disjunction = (disjunction == null) ? equality : or(disjunction, equality);
            }
            return disjunction;
        }
        case BETWEEN: {
            if (literals.size() != 2) {
                throw new RuntimeException("Not able to build 'between' operation filter with " + literals + " which needs two literals");
            }
            Object lowerBound = literals.get(0);
            Object upperBound = literals.get(1);
            // "not less than lower bound" is built first, then "less than or equal to upper bound".
            FilterPredicate notBelowLower = not(buildPredict(PredicateLeaf.Operator.LESS_THAN, lowerBound, columnName));
            FilterPredicate notAboveUpper = buildPredict(PredicateLeaf.Operator.LESS_THAN_EQUALS, upperBound, columnName);
            return FilterApi.and(notAboveUpper, notBelowLower);
        }
        default:
            throw new RuntimeException("Unknown PredicateLeaf Operator type: " + op);
    }
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate)

Example 12 with FilterPredicate

Use of org.apache.parquet.filter2.predicate.FilterPredicate in the Apache parquet-mr project.

Class ParquetLoader, method setInput.

/**
 * Records the input location on this loader and on the job, then synchronises the Pig schema,
 * required-field list and column-index-access flag between the UDF context, this instance and
 * the job configuration. If a filter predicate is found in the UDF context it is also set on
 * the job configuration via {@code ParquetInputFormat.setFilterPredicate}.
 *
 * <p>The statement order matters: values are written to the UDF context, read back, possibly
 * changed by {@code initSchema(job)}, and then written out again.
 *
 * @param location the input location, stored on this instance and passed to {@code setInputPaths}
 * @param job      the job whose input paths and configuration are updated
 * @throws IOException declared by the signature; which helper raises it is not visible here
 */
private void setInput(String location, Job job) throws IOException {
    // Remember that a location was supplied and make it the job's input path.
    this.setLocationHasBeenCalled = true;
    this.location = location;
    setInputPaths(job, location);
    // Publish columnIndexAccess to the UDF context before it is re-read a few lines below.
    // Original note: the value does not come from file metadata or the Pig framework and
    // would get overwritten in initSchema().
    // NOTE(review): the original comment looks truncated at its start — confirm where the
    // initial value is set.
    if (UDFContext.getUDFContext().isFrontend()) {
        storeInUDFContext(PARQUET_COLUMN_INDEX_ACCESS, Boolean.toString(columnIndexAccess));
    }
    // Reload the three settings from the UDF context properties.
    schema = PigSchemaConverter.parsePigSchema(getPropertyFromUDFContext(PARQUET_PIG_SCHEMA));
    requiredFieldList = PigSchemaConverter.deserializeRequiredFieldList(getPropertyFromUDFContext(PARQUET_PIG_REQUIRED_FIELDS));
    columnIndexAccess = Boolean.parseBoolean(getPropertyFromUDFContext(PARQUET_COLUMN_INDEX_ACCESS));
    initSchema(job);
    if (UDFContext.getUDFContext().isFrontend()) {
        // Setting for task-side loading via initSchema()
        storeInUDFContext(PARQUET_PIG_SCHEMA, pigSchemaToString(schema));
        storeInUDFContext(PARQUET_PIG_REQUIRED_FIELDS, serializeRequiredFieldList(requiredFieldList));
    }
    // Used by task-side loader via TupleReadSupport
    getConfiguration(job).set(PARQUET_PIG_SCHEMA, pigSchemaToString(schema));
    getConfiguration(job).set(PARQUET_PIG_REQUIRED_FIELDS, serializeRequiredFieldList(requiredFieldList));
    getConfiguration(job).set(PARQUET_COLUMN_INDEX_ACCESS, Boolean.toString(columnIndexAccess));
    // Forward a filter predicate from the UDF context to the input format, if one is present.
    FilterPredicate filterPredicate = (FilterPredicate) getFromUDFContext(ParquetInputFormat.FILTER_PREDICATE);
    if (filterPredicate != null) {
        ParquetInputFormat.setFilterPredicate(getConfiguration(job), filterPredicate);
    }
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate)

Example 13 with FilterPredicate

Use of org.apache.parquet.filter2.predicate.FilterPredicate in the Apache parquet-mr project.

Class TestFilterApiMethods, method testSerializable.

/**
 * Writes a composite predicate with an ObjectOutputStream, reads it back with an
 * ObjectInputStream, and asserts that the deserialized predicate equals the original.
 */
@Test
public void testSerializable() throws Exception {
    BinaryColumn fooColumn = binaryColumn("foo");

    // Build the predicate tree from the inside out; evaluation order matches the nested form.
    FilterPredicate udpAndPredicate = and(userDefined(intColumn, DummyUdp.class), predicate);
    FilterPredicate orFooEqualsHi = or(udpAndPredicate, eq(fooColumn, Binary.fromString("hi")));
    FilterPredicate original = and(orFooEqualsHi, userDefined(longColumn, new IsMultipleOf(7)));

    // Serialize into an in-memory buffer.
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    ObjectOutputStream objectOut = new ObjectOutputStream(buffer);
    objectOut.writeObject(original);
    objectOut.close();

    // Deserialize from the captured bytes and compare.
    byte[] serialized = buffer.toByteArray();
    ObjectInputStream objectIn = new ObjectInputStream(new ByteArrayInputStream(serialized));
    FilterPredicate restored = (FilterPredicate) objectIn.readObject();
    assertEquals(original, restored);
}
Also used : BinaryColumn(org.apache.parquet.filter2.predicate.Operators.BinaryColumn) ByteArrayInputStream(java.io.ByteArrayInputStream) ByteArrayOutputStream(java.io.ByteArrayOutputStream) ObjectOutputStream(java.io.ObjectOutputStream) ObjectInputStream(java.io.ObjectInputStream) Test(org.junit.Test)

Example 14 with FilterPredicate

Use of org.apache.parquet.filter2.predicate.FilterPredicate in the Apache parquet-mr project.

Class RowGroupFilter, method visit.

/**
 * Returns the blocks that neither enabled filter level reports as droppable.
 *
 * <p>The predicate is first validated against {@code schema}. For each block, the statistics
 * check runs only if {@code levels} contains {@code FilterLevel.STATISTICS}; the dictionary
 * check runs only if the block was not already dropped and {@code levels} contains
 * {@code FilterLevel.DICTIONARY}.
 *
 * @param filterPredicateCompat wrapper supplying the filter predicate
 * @return a new list with the retained blocks, in iteration order of {@code blocks}
 */
@Override
public List<BlockMetaData> visit(FilterCompat.FilterPredicateCompat filterPredicateCompat) {
    FilterPredicate predicate = filterPredicateCompat.getFilterPredicate();
    // check that the schema of the filter matches the schema of the file
    SchemaCompatibilityValidator.validate(predicate, schema);
    List<BlockMetaData> retained = new ArrayList<BlockMetaData>();
    for (BlockMetaData candidate : blocks) {
        if (levels.contains(FilterLevel.STATISTICS)
                && StatisticsFilter.canDrop(predicate, candidate.getColumns())) {
            // Dropped by statistics; the dictionary check is skipped.
            continue;
        }
        if (levels.contains(FilterLevel.DICTIONARY)
                && DictionaryFilter.canDrop(predicate, candidate.getColumns(), reader.getDictionaryReader(candidate))) {
            continue;
        }
        retained.add(candidate);
    }
    return retained;
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ArrayList(java.util.ArrayList) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate)

Example 15 with FilterPredicate

Use of org.apache.parquet.filter2.predicate.FilterPredicate in the Apache parquet-mr project.

Class DictionaryFilterTest, method testInverseUdp.

/**
 * Checks {@code canDrop} on the rewritten negation of three user-defined predicates over
 * {@code int32_field}: one built from the value 42, one from 205, and one from every value in
 * {@code intValues}. Only the last negation is expected to allow dropping the block.
 */
@Test
public void testInverseUdp() throws Exception {
    InInt32UDP only42 = new InInt32UDP(ImmutableSet.of(42));
    InInt32UDP only205 = new InInt32UDP(ImmutableSet.of(205));
    Set<Integer> everyValue = ImmutableSet.copyOf(Arrays.asList(ArrayUtils.toObject(intValues)));
    InInt32UDP matchesAll = new InInt32UDP(everyValue);

    // Negate each UDP and let the rewriter push the NOT down before evaluating.
    FilterPredicate notOnly42 = LogicalInverseRewriter.rewrite(not(userDefined(intColumn("int32_field"), only42)));
    FilterPredicate notOnly205 = LogicalInverseRewriter.rewrite(not(userDefined(intColumn("int32_field"), only205)));
    FilterPredicate notMatchesAll = LogicalInverseRewriter.rewrite(not(userDefined(intColumn("int32_field"), matchesAll)));

    assertFalse("Should not drop block for inverse of non-matching UDP", canDrop(notOnly42, ccmd, dictionaries));
    assertFalse("Should not drop block for inverse of UDP with some matches", canDrop(notOnly205, ccmd, dictionaries));
    assertTrue("Should drop block for inverse of UDP with all matches", canDrop(notMatchesAll, ccmd, dictionaries));
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) Test(org.junit.Test)

Aggregations

FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate)57 Test (org.junit.Test)33 MessageType (org.apache.parquet.schema.MessageType)15 SearchArgument (org.apache.hadoop.hive.ql.io.sarg.SearchArgument)8 BinaryColumn (org.apache.parquet.filter2.predicate.Operators.BinaryColumn)8 ArrayList (java.util.ArrayList)5 List (java.util.List)5 Group (org.apache.parquet.example.data.Group)5 Configuration (org.apache.hadoop.conf.Configuration)4 User (org.apache.parquet.filter2.recordlevel.PhoneBookWriter.User)4 TupleAdaptedPredicate (uk.gov.gchq.koryphe.tuple.predicate.TupleAdaptedPredicate)4 Predicate (java.util.function.Predicate)3 Path (org.apache.hadoop.fs.Path)3 Pair (uk.gov.gchq.gaffer.commonutil.pair.Pair)3 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 GenericRecord (org.apache.avro.generic.GenericRecord)2 IntStatistics (org.apache.parquet.column.statistics.IntStatistics)2 IntColumn (org.apache.parquet.filter2.predicate.Operators.IntColumn)2 Test (org.junit.jupiter.api.Test)2