Search in sources :

Example 31 with FilterPredicate

use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq.

the class QueryGenerator method seedToPredicate.

/**
 * Builds the Parquet filter used to look up a single seed within one group.
 *
 * <p>The result depends on whether {@code group} is an entity group (according to
 * {@code schemaUtils}) and on whether the seed's element id is an {@link EntityId}:
 * <ul>
 *   <li>entity group, entity seed: VERTEX equals the seed;</li>
 *   <li>entity group, edge seed: VERTEX equals the seed's source, OR'ed with VERTEX equals the
 *       seed's destination when that is non-null (skipped when the matching type is EQUAL);</li>
 *   <li>edge group, entity seed: SOURCE (or DESTINATION when {@code reversed}) equals the seed,
 *       optionally restricted to undirected edges (skipped when the matching type is EQUAL);</li>
 *   <li>edge group, edge seed: source and destination must both match, plus a DIRECTED
 *       constraint when the seed's directed type is DIRECTED or UNDIRECTED.</li>
 * </ul>
 *
 * @param seed the seed to convert
 * @param includeIncomingOutgoingType only consulted for an entity seed in an edge group
 * @param seedMatchingType EQUAL suppresses filters that would match a seed of one kind against a group of the other kind
 * @param group the group being queried
 * @param reversed when {@code true} the seed is compared against the opposite edge columns
 * @return the filter, or {@code null} when none of the cases above assigns one
 */
private FilterPredicate seedToPredicate(final ParquetElementSeed seed, final SeededGraphFilters.IncludeIncomingOutgoingType includeIncomingOutgoingType, final SeedMatching.SeedMatchingType seedMatchingType, final String group, final boolean reversed) {
    final boolean entityGroup = schemaUtils.getEntityGroups().contains(group);
    final boolean seededByEntity = seed.getElementId() instanceof EntityId;
    final boolean equalMatchingOnly = seedMatchingType == SeedMatching.SeedMatchingType.EQUAL;
    FilterPredicate predicate = null;
    if (entityGroup) {
        if (seededByEntity) {
            predicate = getIsEqualFilter(ParquetStore.VERTEX, ((ParquetEntitySeed) seed).getSeed(), group);
        } else {
            final ParquetEdgeSeed edge = (ParquetEdgeSeed) seed;
            // With EQUAL matching an edge seed yields no filter for an entity group
            if (!equalMatchingOnly) {
                // The vertex may be either end of the edge seed
                predicate = getIsEqualFilter(ParquetStore.VERTEX, edge.getSource(), group);
                if (edge.getDestination() != null) {
                    predicate = FilterPredicateUtils.or(predicate, getIsEqualFilter(ParquetStore.VERTEX, edge.getDestination(), group));
                }
            }
        }
    } else if (seededByEntity) {
        // With EQUAL matching an entity seed yields no filter for an edge group
        if (!equalMatchingOnly) {
            final ParquetEntitySeed entity = (ParquetEntitySeed) seed;
            final boolean incoming = includeIncomingOutgoingType == SeededGraphFilters.IncludeIncomingOutgoingType.INCOMING;
            final boolean outgoing = includeIncomingOutgoingType == SeededGraphFilters.IncludeIncomingOutgoingType.OUTGOING;
            // Reversed lookups compare the seed with the destination column, otherwise with the source column
            predicate = getIsEqualFilter(reversed ? ParquetStore.DESTINATION : ParquetStore.SOURCE, entity.getSeed(), group);
            // INCOMING via the source column and OUTGOING via the destination column only accept undirected edges
            if ((incoming && !reversed) || (outgoing && reversed)) {
                predicate = FilterPredicateUtils.and(predicate, getIsEqualFilter(ParquetStore.DIRECTED, new Object[] { false }, group));
            }
        }
    } else {
        final ParquetEdgeSeed edge = (ParquetEdgeSeed) seed;
        // TODO Optimise this - there are times the reversed lookup is unnecessary
        final String columnForSeedSource = reversed ? ParquetStore.DESTINATION : ParquetStore.SOURCE;
        final String columnForSeedDestination = reversed ? ParquetStore.SOURCE : ParquetStore.DESTINATION;
        predicate = getIsEqualFilter(columnForSeedSource, edge.getSource(), group);
        predicate = FilterPredicateUtils.and(predicate, getIsEqualFilter(columnForSeedDestination, edge.getDestination(), group));
        // Any other directed type (including null) adds no DIRECTED constraint
        final DirectedType directedType = edge.getDirectedType();
        if (directedType == DirectedType.DIRECTED || directedType == DirectedType.UNDIRECTED) {
            predicate = FilterPredicateUtils.and(predicate, getIsEqualFilter(ParquetStore.DIRECTED, new Object[] { directedType == DirectedType.DIRECTED }, group));
        }
    }
    LOGGER.debug("Returning {} from seedToPredicate", predicate);
    return predicate;
}
Also used : EntityId(uk.gov.gchq.gaffer.data.element.id.EntityId) DirectedType(uk.gov.gchq.gaffer.data.element.id.DirectedType) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) ElementId(uk.gov.gchq.gaffer.data.element.id.ElementId)

Example 32 with FilterPredicate

use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq.

the class QueryGenerator method getPathsAndFiltersForAllElements.

/**
 * Builds the {@code ParquetQuery} for a {@code GetAllElements} operation.
 *
 * <p>For every group returned by {@code getRelevantGroups} for the operation's view, the files
 * reported by the store are collected, a predicate is derived from the view (combined with the
 * directed-type predicate for edge groups), and one {@code ParquetFileQuery} per file is added
 * to the result.
 *
 * @param getAllElements the operation supplying the view and directed type
 * @return the query containing one file query per (group, file) pair
 * @throws IOException declared by the original signature; presumably raised while listing files
 * @throws OperationException declared by the original signature; presumably raised by the helpers called here
 */
private ParquetQuery getPathsAndFiltersForAllElements(final GetAllElements getAllElements) throws IOException, OperationException {
    // Step 1 and 2: map every group selected by the view to the files holding its data
    final Map<String, List<Path>> filesByGroup = new HashMap<>();
    for (final String group : getRelevantGroups(getAllElements.getView())) {
        filesByGroup.put(group, store.getFilesForGroup(group));
    }
    // Step 3: derive a predicate per group from the view and, for edge groups, the directed type
    final Map<String, Pair<FilterPredicate, Boolean>> predicateByGroup = new HashMap<>();
    for (final String group : filesByGroup.keySet()) {
        Pair<FilterPredicate, Boolean> groupFilter = getPredicateFromView(getAllElements.getView(), group, schemaUtils.getEntityGroups().contains(group));
        if (schemaUtils.getEdgeGroups().contains(group)) {
            final FilterPredicate directedness = getPredicateFromDirectedType(getAllElements.getDirectedType());
            if (null == groupFilter) {
                groupFilter = new Pair<>(directedness, false);
            } else {
                groupFilter.setFirst(FilterPredicateUtils.and(groupFilter.getFirst(), directedness));
            }
        }
        // Groups without any predicate are simply left out of the map
        if (null != groupFilter) {
            predicateByGroup.put(group, groupFilter);
        }
    }
    // Step 4: emit one file query per file, attaching the group's predicate when there is one
    final ParquetQuery query = new ParquetQuery();
    for (final Map.Entry<String, List<Path>> groupFiles : filesByGroup.entrySet()) {
        final String group = groupFiles.getKey();
        // Null values are never stored above, so a null lookup means "no predicate for this group"
        final Pair<FilterPredicate, Boolean> groupFilter = predicateByGroup.get(group);
        for (final Path file : groupFiles.getValue()) {
            final ParquetFileQuery fileQuery;
            if (null == groupFilter) {
                fileQuery = new ParquetFileQuery(file, null, false);
            } else {
                fileQuery = new ParquetFileQuery(file, groupFilter.getFirst(), groupFilter.getSecond());
            }
            query.add(group, fileQuery);
        }
    }
    LOGGER.info("Created ParquetQuery of {}", query);
    return query;
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) List(java.util.List) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) HashMap(java.util.HashMap) Map(java.util.Map) Pair(uk.gov.gchq.gaffer.commonutil.pair.Pair)

Example 33 with FilterPredicate

use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq.

the class JavaPredicateToParquetPredicate method getIsEqualFilter.

/**
 * Builds a Parquet "equals" predicate for a column of the given group.
 *
 * <p>The column is resolved to its Parquet paths via {@code schemaUtils.getPaths}; when no
 * paths are returned the column name itself is used as the single path. The i-th path is
 * compared with the i-th entry of {@code parquetObjects} and the per-path predicates are AND'ed.
 *
 * <p>If a value is {@code null} or of a type with no native Parquet comparison here,
 * {@code fullyApplied} is set to {@code false}, a warning is logged and {@code null} is returned.
 *
 * @param colName the column to filter on
 * @param parquetObjects the values to compare against, one per resolved path
 * @param group the group the column belongs to
 * @param schemaUtils used to resolve the column's Parquet paths
 * @return the combined predicate, or {@code null} if a value could not be converted
 */
public FilterPredicate getIsEqualFilter(final String colName, final Object[] parquetObjects, final String group, final SchemaUtils schemaUtils) {
    String[] paths = schemaUtils.getPaths(group, colName);
    if (null == paths) {
        paths = new String[] {colName};
    }
    FilterPredicate filter = null;
    for (int i = 0; i < paths.length; i++) {
        final String path = paths[i];
        final Object value = parquetObjects[i];
        final FilterPredicate tempFilter;
        if (value instanceof String) {
            tempFilter = eq(binaryColumn(path), Binary.fromString((String) value));
        } else if (value instanceof Boolean) {
            tempFilter = eq(booleanColumn(path), (Boolean) value);
        } else if (value instanceof Double) {
            tempFilter = eq(doubleColumn(path), (Double) value);
        } else if (value instanceof Float) {
            tempFilter = eq(floatColumn(path), (Float) value);
        } else if (value instanceof Integer) {
            tempFilter = eq(intColumn(path), (Integer) value);
        } else if (value instanceof Long) {
            tempFilter = eq(longColumn(path), (Long) value);
        } else if (value instanceof java.util.Date) {
            // java.sql.Date extends java.util.Date, so this branch also covers it
            tempFilter = eq(longColumn(path), ((java.util.Date) value).getTime());
        } else if (value instanceof Short) {
            tempFilter = eq(intColumn(path), ((Short) value).intValue());
        } else if (value instanceof byte[]) {
            tempFilter = eq(binaryColumn(path), Binary.fromReusedByteArray((byte[]) value));
        } else {
            fullyApplied = false;
            // A null value matches no instanceof test; describe it without dereferencing it
            final String typeName = null == value ? "null" : value.getClass().getCanonicalName();
            LOGGER.warn(typeName + " is not a natively supported type for the IsEqual filter, therefore execution will take longer to perform this filter.");
            return null;
        }
        filter = null == filter ? tempFilter : and(filter, tempFilter);
    }
    return filter;
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate)

Example 34 with FilterPredicate

use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq.

the class JavaPredicateToParquetPredicate method getIsLessThanOrEqualToFilter.

/**
 * Builds a Parquet "less than or equal to" predicate for a column of the given group.
 *
 * <p>The column is resolved to its Parquet paths via {@code schemaUtils.getPaths}; when no
 * paths are returned the column name itself is used as the single path. The i-th path is
 * compared with the i-th entry of {@code parquetObjects} and the per-path predicates are AND'ed.
 *
 * <p>If a value is {@code null} or of a type with no native Parquet comparison here,
 * {@code fullyApplied} is set to {@code false}, a warning is logged and {@code null} is returned.
 *
 * @param colName the column to filter on
 * @param parquetObjects the upper bounds to compare against, one per resolved path
 * @param group the group the column belongs to
 * @param schemaUtils used to resolve the column's Parquet paths
 * @return the combined predicate, or {@code null} if a value could not be converted
 */
private FilterPredicate getIsLessThanOrEqualToFilter(final String colName, final Object[] parquetObjects, final String group, final SchemaUtils schemaUtils) {
    String[] paths = schemaUtils.getPaths(group, colName);
    if (null == paths) {
        paths = new String[] {colName};
    }
    FilterPredicate filter = null;
    for (int i = 0; i < paths.length; i++) {
        final String path = paths[i];
        final Object value = parquetObjects[i];
        final FilterPredicate tempFilter;
        if (value instanceof String) {
            tempFilter = ltEq(binaryColumn(path), Binary.fromString((String) value));
        } else if (value instanceof Double) {
            tempFilter = ltEq(doubleColumn(path), (Double) value);
        } else if (value instanceof Float) {
            tempFilter = ltEq(floatColumn(path), (Float) value);
        } else if (value instanceof Integer) {
            tempFilter = ltEq(intColumn(path), (Integer) value);
        } else if (value instanceof Long) {
            tempFilter = ltEq(longColumn(path), (Long) value);
        } else if (value instanceof java.util.Date) {
            // java.sql.Date extends java.util.Date, so this branch also covers it
            tempFilter = ltEq(longColumn(path), ((java.util.Date) value).getTime());
        } else if (value instanceof Short) {
            tempFilter = ltEq(intColumn(path), ((Short) value).intValue());
        } else if (value instanceof byte[]) {
            tempFilter = ltEq(binaryColumn(path), Binary.fromReusedByteArray((byte[]) value));
        } else {
            fullyApplied = false;
            // A null value matches no instanceof test; describe it without dereferencing it
            final String typeName = null == value ? "null" : value.getClass().getCanonicalName();
            LOGGER.warn(typeName + " is not a natively supported type for the IsLessThanOrEqualTo filter, therefore execution will take longer to perform this filter.");
            return null;
        }
        filter = null == filter ? tempFilter : and(filter, tempFilter);
    }
    return filter;
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate)

Example 35 with FilterPredicate

use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq.

the class JavaPredicateToParquetPredicate method getOrFilter.

/**
 * Converts each predicate in the list to a Parquet predicate and combines them with
 * {@code FilterPredicateUtils.or}.
 *
 * <p>A {@link TupleAdaptedPredicate} is unwrapped to its inner predicate, and its integer
 * selection is used as indices into {@code selection} to build the column selection for that
 * inner predicate. Any other predicate is converted using {@code selection} unchanged.
 * If any converter reports that it was not fully applied, this instance's
 * {@code fullyApplied} flag is cleared as well.
 *
 * @param predicateList the predicates to OR together
 * @param selection the column selection the predicates operate on
 * @param group the group being filtered
 * @param schemaUtils passed through to each nested converter
 * @return the combined predicate as produced by {@code FilterPredicateUtils.or}
 * @throws SerialisationException declared by the original signature; presumably raised by the nested conversion
 */
public FilterPredicate getOrFilter(final List<Predicate> predicateList, final String[] selection, final String group, final SchemaUtils schemaUtils) throws SerialisationException {
    FilterPredicate disjunction = null;
    for (final Predicate candidate : predicateList) {
        Predicate unwrapped = candidate;
        String[] columns = selection;
        if (candidate instanceof TupleAdaptedPredicate) {
            final TupleAdaptedPredicate adapted = (TupleAdaptedPredicate) candidate;
            unwrapped = adapted.getPredicate();
            // Translate the adapter's positional selection into column names
            final Integer[] indices = (Integer[]) adapted.getSelection();
            columns = new String[indices.length];
            for (int pos = 0; pos < indices.length; pos++) {
                columns[pos] = selection[indices[pos]];
            }
        }
        final JavaPredicateToParquetPredicate converter = new JavaPredicateToParquetPredicate(schemaUtils, unwrapped, columns, group);
        final FilterPredicate converted = converter.getParquetPredicate();
        if (!converter.fullyApplied) {
            fullyApplied = false;
        }
        // NOTE(review): if FilterPredicateUtils.or ignores a null operand, a branch that could not be
        // converted would make the pushed-down filter stricter than the original OR - confirm.
        disjunction = FilterPredicateUtils.or(disjunction, converted);
    }
    return disjunction;
}
Also used : TupleAdaptedPredicate(uk.gov.gchq.koryphe.tuple.predicate.TupleAdaptedPredicate) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) TupleAdaptedPredicate(uk.gov.gchq.koryphe.tuple.predicate.TupleAdaptedPredicate) Predicate(java.util.function.Predicate)

Aggregations

FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate)57 Test (org.junit.Test)33 MessageType (org.apache.parquet.schema.MessageType)15 SearchArgument (org.apache.hadoop.hive.ql.io.sarg.SearchArgument)8 BinaryColumn (org.apache.parquet.filter2.predicate.Operators.BinaryColumn)8 ArrayList (java.util.ArrayList)5 List (java.util.List)5 Group (org.apache.parquet.example.data.Group)5 Configuration (org.apache.hadoop.conf.Configuration)4 User (org.apache.parquet.filter2.recordlevel.PhoneBookWriter.User)4 TupleAdaptedPredicate (uk.gov.gchq.koryphe.tuple.predicate.TupleAdaptedPredicate)4 Predicate (java.util.function.Predicate)3 Path (org.apache.hadoop.fs.Path)3 Pair (uk.gov.gchq.gaffer.commonutil.pair.Pair)3 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 GenericRecord (org.apache.avro.generic.GenericRecord)2 IntStatistics (org.apache.parquet.column.statistics.IntStatistics)2 IntColumn (org.apache.parquet.filter2.predicate.Operators.IntColumn)2 Test (org.junit.jupiter.api.Test)2