Search in sources :

Example 36 with FilterPredicate

use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq.

the class JavaPredicateToParquetPredicate method getIsMoreThanFilter.

/**
 * Builds a Parquet "greater than" predicate for the given column.
 *
 * <p>The column is resolved to its Parquet paths with {@code schemaUtils.getPaths(group, colName)};
 * when that returns {@code null} the column name itself is used as the only path. For each path a
 * {@code gt} predicate is built from the element of {@code parquetObjects} at the same index, and
 * the per-path predicates are joined with {@code and}.
 *
 * @param colName        the column to filter on
 * @param parquetObjects the values to compare against, read at the same index as the path
 * @param group          the group passed to {@code schemaUtils.getPaths}
 * @param schemaUtils    used to look up the paths for the column
 * @return the combined predicate; {@code null} when there are no paths, or when a value is
 *         {@code null} or of a type with no native comparison (in which case {@code fullyApplied}
 *         is set to {@code false} and a warning is logged)
 */
private FilterPredicate getIsMoreThanFilter(final String colName, final Object[] parquetObjects, final String group, final SchemaUtils schemaUtils) {
    String[] paths = schemaUtils.getPaths(group, colName);
    if (null == paths) {
        paths = new String[1];
        paths[0] = colName;
    }
    FilterPredicate filter = null;
    for (int i = 0; i < paths.length; i++) {
        final String path = paths[i];
        final Object value = parquetObjects[i];
        final FilterPredicate tempFilter;
        if (value instanceof String) {
            tempFilter = gt(binaryColumn(path), Binary.fromString((String) value));
        } else if (value instanceof Double) {
            tempFilter = gt(doubleColumn(path), (Double) value);
        } else if (value instanceof Float) {
            tempFilter = gt(floatColumn(path), (Float) value);
        } else if (value instanceof Integer) {
            tempFilter = gt(intColumn(path), (Integer) value);
        } else if (value instanceof Long) {
            tempFilter = gt(longColumn(path), (Long) value);
        } else if (value instanceof java.util.Date) {
            // java.sql.Date is a subclass of java.util.Date, so this branch covers both;
            // a separate java.sql.Date branch after this one could never be reached.
            tempFilter = gt(longColumn(path), ((java.util.Date) value).getTime());
        } else if (value instanceof Short) {
            tempFilter = gt(intColumn(path), ((Short) value).intValue());
        } else if (value instanceof byte[]) {
            tempFilter = gt(binaryColumn(path), Binary.fromReusedByteArray((byte[]) value));
        } else {
            fullyApplied = false;
            // A null value fails every instanceof test and lands here; avoid calling getClass() on it.
            final String typeName = null == value ? "null" : value.getClass().getCanonicalName();
            LOGGER.warn(typeName + " is not a natively supported type for the IsMoreThan filter, therefore execution will take longer to perform this filter.");
            return null;
        }
        if (null == filter) {
            filter = tempFilter;
        } else {
            filter = and(filter, tempFilter);
        }
    }
    return filter;
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate)

Example 37 with FilterPredicate

use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq.

the class JavaPredicateToParquetPredicate method getAndFilter.

/**
 * Converts each predicate in the list to a Parquet predicate and folds the results together
 * with {@code FilterPredicateUtils.and}.
 *
 * <p>A {@code TupleAdaptedPredicate} is unwrapped to its inner predicate, and its integer
 * selection is used to pick the matching entries out of {@code selection}. Any other predicate
 * is converted as-is with the full {@code selection}. If any converter reports that it was not
 * fully applied, this instance's {@code fullyApplied} flag is cleared as well.
 *
 * @param predicateList the predicates to combine
 * @param selection     the selection the predicates refer to
 * @param group         the group handed to each converter
 * @param schemaUtils   the schema utilities handed to each converter
 * @return the accumulated result of {@code FilterPredicateUtils.and} over all converted
 *         predicates; {@code null} if {@code predicateList} is empty
 * @throws SerialisationException declared for callers; propagated from predicate conversion
 */
public FilterPredicate getAndFilter(final List<Predicate> predicateList, final String[] selection, final String group, final SchemaUtils schemaUtils) throws SerialisationException {
    FilterPredicate result = null;
    for (final Predicate current : predicateList) {
        // Start from the "plain predicate" case and override it for tuple-adapted predicates.
        Predicate unwrapped = current;
        String[] columns = selection;
        if (current instanceof TupleAdaptedPredicate) {
            final TupleAdaptedPredicate adapted = (TupleAdaptedPredicate) current;
            unwrapped = adapted.getPredicate();
            // Map the adapter's integer selection back onto the caller's selection.
            final Integer[] indices = (Integer[]) adapted.getSelection();
            columns = new String[indices.length];
            int position = 0;
            for (final Integer index : indices) {
                columns[position] = selection[index];
                position++;
            }
        }
        final JavaPredicateToParquetPredicate converter = new JavaPredicateToParquetPredicate(schemaUtils, unwrapped, columns, group);
        final FilterPredicate converted = converter.getParquetPredicate();
        if (!converter.fullyApplied) {
            fullyApplied = false;
        }
        result = FilterPredicateUtils.and(result, converted);
    }
    return result;
}
Also used : TupleAdaptedPredicate(uk.gov.gchq.koryphe.tuple.predicate.TupleAdaptedPredicate) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) Predicate(java.util.function.Predicate)

Example 38 with FilterPredicate

use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq.

the class JavaPredicateToParquetPredicate method getIsMoreThanOrEqualToFilter.

/**
 * Builds a Parquet "greater than or equal to" predicate for the given column.
 *
 * <p>The column is resolved to its Parquet paths with {@code schemaUtils.getPaths(group, colName)};
 * when that returns {@code null} the column name itself is used as the only path. For each path a
 * {@code gtEq} predicate is built from the element of {@code parquetObjects} at the same index,
 * and the per-path predicates are joined with {@code and}.
 *
 * @param colName        the column to filter on
 * @param parquetObjects the values to compare against, read at the same index as the path
 * @param group          the group passed to {@code schemaUtils.getPaths}
 * @param schemaUtils    used to look up the paths for the column
 * @return the combined predicate; {@code null} when there are no paths, or when a value is
 *         {@code null} or of a type with no native comparison (in which case {@code fullyApplied}
 *         is set to {@code false} and a warning is logged)
 */
private FilterPredicate getIsMoreThanOrEqualToFilter(final String colName, final Object[] parquetObjects, final String group, final SchemaUtils schemaUtils) {
    String[] paths = schemaUtils.getPaths(group, colName);
    if (null == paths) {
        paths = new String[1];
        paths[0] = colName;
    }
    FilterPredicate filter = null;
    for (int i = 0; i < paths.length; i++) {
        final String path = paths[i];
        final Object value = parquetObjects[i];
        final FilterPredicate tempFilter;
        if (value instanceof String) {
            tempFilter = gtEq(binaryColumn(path), Binary.fromString((String) value));
        } else if (value instanceof Double) {
            tempFilter = gtEq(doubleColumn(path), (Double) value);
        } else if (value instanceof Float) {
            tempFilter = gtEq(floatColumn(path), (Float) value);
        } else if (value instanceof Integer) {
            tempFilter = gtEq(intColumn(path), (Integer) value);
        } else if (value instanceof Long) {
            tempFilter = gtEq(longColumn(path), (Long) value);
        } else if (value instanceof java.util.Date) {
            // java.sql.Date is a subclass of java.util.Date, so this branch covers both;
            // a separate java.sql.Date branch after this one could never be reached.
            tempFilter = gtEq(longColumn(path), ((java.util.Date) value).getTime());
        } else if (value instanceof Short) {
            tempFilter = gtEq(intColumn(path), ((Short) value).intValue());
        } else if (value instanceof byte[]) {
            tempFilter = gtEq(binaryColumn(path), Binary.fromReusedByteArray((byte[]) value));
        } else {
            fullyApplied = false;
            // A null value fails every instanceof test and lands here; avoid calling getClass() on it.
            final String typeName = null == value ? "null" : value.getClass().getCanonicalName();
            LOGGER.warn(typeName + " is not a natively supported type for the IsMoreThanOrEqualTo filter, therefore execution will take longer to perform this filter.");
            return null;
        }
        if (null == filter) {
            filter = tempFilter;
        } else {
            filter = and(filter, tempFilter);
        }
    }
    return filter;
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate)

Example 39 with FilterPredicate

use of org.apache.parquet.filter2.predicate.FilterPredicate in project beam by apache.

the class ParquetIOTest method testWriteAndReadFilesAsJsonForUnknownSchemaWithConfiguration.

/**
 * Writes ten generated records as Parquet files into the temporary folder, then reads them back
 * as JSON through {@code ParquetIO.parseGenericRecords} using a {@code Configuration} that
 * carries an {@code id == "0"} filter predicate. The JSON read back is expected to match the
 * JSON form of {@code generateGenericRecords(1)}.
 */
@Test
public void testWriteAndReadFilesAsJsonForUnknownSchemaWithConfiguration() {
    List<GenericRecord> records = generateGenericRecords(10);
    List<GenericRecord> expectedRecords = generateGenericRecords(1);
    String outputDir = temporaryFolder.getRoot().getAbsolutePath();
    String inputGlob = outputDir + "/*";

    // Write phase.
    mainPipeline
        .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA)))
        .apply(FileIO.<GenericRecord>write().via(ParquetIO.sink(SCHEMA)).to(outputDir));
    mainPipeline.run().waitUntilFinish();

    // Register the filter predicate on the configuration handed to the reader.
    Configuration configuration = new Configuration();
    FilterPredicate filterPredicate =
        FilterApi.eq(FilterApi.binaryColumn("id"), Binary.fromString("0"));
    ParquetInputFormat.setFilterPredicate(configuration, filterPredicate);

    // Read phase.
    PCollection<String> readBackAsJson =
        readPipeline.apply(
            ParquetIO.parseGenericRecords(ParseGenericRecordAsJsonFn.create())
                .withConfiguration(configuration)
                .from(inputGlob));
    PAssert.that(readBackAsJson).containsInAnyOrder(convertRecordsToJson(expectedRecords));
    readPipeline.run().waitUntilFinish();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)

Example 40 with FilterPredicate

use of org.apache.parquet.filter2.predicate.FilterPredicate in project beam by apache.

the class ParquetIOTest method testWriteAndReadWithConfiguration.

/**
 * Writes ten generated records as Parquet files into the temporary folder, then reads them back
 * with {@code ParquetIO.read(SCHEMA)} (split reading enabled) using a {@code Configuration} that
 * carries an {@code id == "0"} filter predicate. The records read back are expected to match
 * {@code generateGenericRecords(1)}.
 */
@Test
public void testWriteAndReadWithConfiguration() {
    List<GenericRecord> records = generateGenericRecords(10);
    List<GenericRecord> expectedRecords = generateGenericRecords(1);
    String outputDir = temporaryFolder.getRoot().getAbsolutePath();
    String inputGlob = outputDir + "/*";

    // Write phase.
    mainPipeline
        .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA)))
        .apply(FileIO.<GenericRecord>write().via(ParquetIO.sink(SCHEMA)).to(outputDir));
    mainPipeline.run().waitUntilFinish();

    // Register the filter predicate on the configuration handed to the reader.
    Configuration configuration = new Configuration();
    FilterPredicate filterPredicate =
        FilterApi.eq(FilterApi.binaryColumn("id"), Binary.fromString("0"));
    ParquetInputFormat.setFilterPredicate(configuration, filterPredicate);

    // Read phase.
    PCollection<GenericRecord> readBack =
        readPipeline.apply(
            ParquetIO.read(SCHEMA)
                .from(inputGlob)
                .withConfiguration(configuration)
                .withSplit());
    PAssert.that(readBack).containsInAnyOrder(expectedRecords);
    readPipeline.run().waitUntilFinish();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.Test)

Aggregations

FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate) 57, Test (org.junit.Test) 33, MessageType (org.apache.parquet.schema.MessageType) 15, SearchArgument (org.apache.hadoop.hive.ql.io.sarg.SearchArgument) 8, BinaryColumn (org.apache.parquet.filter2.predicate.Operators.BinaryColumn) 8, ArrayList (java.util.ArrayList) 5, List (java.util.List) 5, Group (org.apache.parquet.example.data.Group) 5, Configuration (org.apache.hadoop.conf.Configuration) 4, User (org.apache.parquet.filter2.recordlevel.PhoneBookWriter.User) 4, TupleAdaptedPredicate (uk.gov.gchq.koryphe.tuple.predicate.TupleAdaptedPredicate) 4, Predicate (java.util.function.Predicate) 3, Path (org.apache.hadoop.fs.Path) 3, Pair (uk.gov.gchq.gaffer.commonutil.pair.Pair) 3, HashMap (java.util.HashMap) 2, HashSet (java.util.HashSet) 2, GenericRecord (org.apache.avro.generic.GenericRecord) 2, IntStatistics (org.apache.parquet.column.statistics.IntStatistics) 2, IntColumn (org.apache.parquet.filter2.predicate.Operators.IntColumn) 2, Test (org.junit.jupiter.api.Test) 2