
Example 56 with FilterPredicate

Use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq.

Class QueryGenerator, method getPathsAndFiltersForGetElements.

private ParquetQuery getPathsAndFiltersForGetElements(final GetElements getElements) throws SerialisationException, OperationException {
    final Iterable<? extends ElementId> seeds = getElements.getInput();
    if (null == seeds || !seeds.iterator().hasNext()) {
        return new ParquetQuery();
    }
    // Stage 1: Use the view to identify all groups that might contain data
    final Set<String> allRelevantGroups = getRelevantGroups(getElements.getView());
    // Stage 2: For each of the above groups, create a Parquet predicate from the view and directedType
    final Map<String, Pair<FilterPredicate, Boolean>> groupToPredicate = new HashMap<>();
    for (final String group : allRelevantGroups) {
        Pair<FilterPredicate, Boolean> filter = getPredicateFromView(getElements.getView(), group, schemaUtils.getEntityGroups().contains(group));
        if (schemaUtils.getEdgeGroups().contains(group)) {
            final FilterPredicate directedTypeFilter = getPredicateFromDirectedType(getElements.getDirectedType());
            filter.setFirst(FilterPredicateUtils.and(filter.getFirst(), directedTypeFilter));
        }
        groupToPredicate.put(group, filter);
    }
    // Stage 3: Convert seeds to ParquetElementSeeds and create Stream of <group, ParquetElementSeed> pairs where
    // each seed appears once for each of the relevant groups
    final Stream<Pair<String, ParquetElementSeed>> groupAndSeeds = StreamSupport.stream(seeds.spliterator(), false).flatMap(seed -> {
        try {
            return seedToParquetObject(seed, allRelevantGroups).stream();
        } catch (final SerialisationException e) {
            throw new RuntimeException("SerialisationException converting seed into a Parquet object", e);
        }
    });
    // Stage 4: Convert stream of <group, ParquetElementSeed> pairs to a stream of tuples
    // <group, ParquetElementSeed, Set<PathInfo>>
    final Stream<Tuple3<String, ParquetElementSeed, Set<PathInfo>>> groupSeedsAndPaths = groupAndSeeds.map(pair -> getRelevantFiles(pair.getFirst(), pair.getSecond()));
    // Stage 5: Create map from path to list of <group, reversed edge flag, Parquet seeds>
    // TODO: Currently this consumes the entire stream - need to do this in batches
    final List<Tuple3<String, ParquetElementSeed, Set<PathInfo>>> groupSeedsAndPathsList = groupSeedsAndPaths.collect(Collectors.toList());
    final Map<PathInfo, List<Tuple3<String, Boolean, ParquetElementSeed>>> pathToSeeds = new HashMap<>();
    for (final Tuple3<String, ParquetElementSeed, Set<PathInfo>> tuple : groupSeedsAndPathsList) {
        Set<PathInfo> paths = tuple.get2();
        for (final PathInfo pathInfo : paths) {
            if (!pathToSeeds.containsKey(pathInfo)) {
                pathToSeeds.put(pathInfo, new ArrayList<>());
            }
            pathToSeeds.get(pathInfo).add(new Tuple3<>(tuple.get0(), pathInfo.isReversed(), tuple.get1()));
        }
    }
    // Stage 6: Create ParquetQuery
    final SeededGraphFilters.IncludeIncomingOutgoingType includeIncomingOutgoingType = getElements.getIncludeIncomingOutGoing();
    final SeedMatching.SeedMatchingType seedMatchingType = getElements.getSeedMatching();
    final ParquetQuery parquetQuery = new ParquetQuery();
    for (final PathInfo pathInfo : pathToSeeds.keySet()) {
        List<Tuple3<String, Boolean, ParquetElementSeed>> seedList = pathToSeeds.get(pathInfo);
        FilterPredicate filterPredicate = seedsToPredicate(seedList, includeIncomingOutgoingType, seedMatchingType);
        if (null != filterPredicate) {
            final String group = pathInfo.getGroup();
            final Pair<FilterPredicate, Boolean> viewFilterPredicate = groupToPredicate.get(group);
            if (null != viewFilterPredicate) {
                // Put the view predicate first, as the filter that checks whether an element matches one of many seeds could be complex
                filterPredicate = FilterPredicateUtils.and(viewFilterPredicate.getFirst(), filterPredicate);
            }
            final ParquetFileQuery fileQuery = new ParquetFileQuery(pathInfo.getPath(), filterPredicate, viewFilterPredicate.getSecond());
            parquetQuery.add(group, fileQuery);
        }
    }
    LOGGER.info("Created ParquetQuery of {}", parquetQuery);
    return parquetQuery;
}
Also used : HashSet(java.util.HashSet) Set(java.util.Set) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) List(java.util.List) Pair(uk.gov.gchq.gaffer.commonutil.pair.Pair) SerialisationException(uk.gov.gchq.gaffer.exception.SerialisationException) Tuple3(uk.gov.gchq.koryphe.tuple.n.Tuple3) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) SeededGraphFilters(uk.gov.gchq.gaffer.operation.graph.SeededGraphFilters) SeedMatching(uk.gov.gchq.gaffer.operation.SeedMatching)
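
The key pattern in Stage 2 and Stage 6 above is combining several independently built predicates with a null-safe AND. Below is a minimal sketch of that pattern, assuming nothing beyond the Parquet FilterApi; it is not Gaffer's FilterPredicateUtils, and the column names ("count", "vertex") are purely illustrative.

import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.io.api.Binary;

public final class PredicateCombinerSketch {

    private PredicateCombinerSketch() {
    }

    // Null-safe AND: returns the non-null side if one predicate is absent,
    // otherwise the conjunction of the two.
    public static FilterPredicate and(final FilterPredicate left, final FilterPredicate right) {
        if (left == null) {
            return right;
        }
        if (right == null) {
            return left;
        }
        return FilterApi.and(left, right);
    }

    public static void main(final String[] args) {
        // A view-style predicate and a seed-style predicate, merged as in Stage 6
        final FilterPredicate viewPredicate = FilterApi.gt(FilterApi.longColumn("count"), 10L);
        final FilterPredicate seedPredicate = FilterApi.eq(FilterApi.binaryColumn("vertex"), Binary.fromString("A"));
        System.out.println(and(viewPredicate, seedPredicate));
    }
}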

Example 57 with FilterPredicate

Use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq.

Class JavaPredicateToParquetPredicate, method getAgeOffPredicate.

public FilterPredicate getAgeOffPredicate(final AgeOff ageOff, final String[] selection, final String group, final SchemaUtils schemaUtils) {
    String[] paths = schemaUtils.getPaths(group, selection[0]);
    if (paths == null) {
        paths = new String[1];
        paths[0] = selection[0];
    }
    FilterPredicate filter = null;
    for (int i = 0; i < paths.length; i++) {
        final String path = paths[i];
        FilterPredicate tempFilter;
        Long ageOffTime = System.currentTimeMillis() - ageOff.getAgeOffTime();
        tempFilter = gt(longColumn(path), ageOffTime);
        if (filter == null) {
            filter = tempFilter;
        } else {
            filter = and(filter, tempFilter);
        }
    }
    return filter;
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate)
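
The predicate built above is simply "timestamp column > (now - ageOffTime)", applied to each Parquet path for the selected property. A self-contained sketch of that single building block, with an illustrative column name, might look like this:

import static org.apache.parquet.filter2.predicate.FilterApi.gt;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;

import org.apache.parquet.filter2.predicate.FilterPredicate;

public class AgeOffPredicateSketch {

    // Keep rows whose long-encoded timestamp is newer than (now - maxAgeMillis)
    public static FilterPredicate newerThan(final String timestampColumn, final long maxAgeMillis) {
        final long threshold = System.currentTimeMillis() - maxAgeMillis;
        return gt(longColumn(timestampColumn), threshold);
    }

    public static void main(final String[] args) {
        // For example, keep only rows written in the last 7 days
        System.out.println(newerThan("timestamp", 7L * 24 * 60 * 60 * 1000));
    }
}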

Example 58 with FilterPredicate

Use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq.

Class JavaPredicateToParquetPredicate, method getParquetPredicate.

public FilterPredicate getParquetPredicate() throws SerialisationException {
    FilterPredicate filterResult;
    if (javaPredicate instanceof AgeOff) {
        filterResult = getAgeOffPredicate((AgeOff) javaPredicate, selection, group, schemaUtils);
    } else if (javaPredicate instanceof And) {
        final And and = (And) javaPredicate;
        filterResult = getAndFilter((List<Predicate>) and.getComponents(), selection, group, schemaUtils);
    } else if (javaPredicate instanceof Or) {
        final Or or = (Or) javaPredicate;
        filterResult = getOrFilter((List<Predicate>) or.getComponents(), selection, group, schemaUtils);
    } else if (javaPredicate instanceof Not) {
        final Not not = (Not) javaPredicate;
        final JavaPredicateToParquetPredicate predicateConverter = new JavaPredicateToParquetPredicate(schemaUtils, not.getPredicate(), selection, group);
        final FilterPredicate parquetPredicate = predicateConverter.getParquetPredicate();
        if (!predicateConverter.fullyApplied) {
            fullyApplied = false;
        }
        filterResult = FilterPredicateUtils.not(parquetPredicate);
    } else {
        filterResult = getPrimitiveFilter(javaPredicate, selection[0], group, schemaUtils);
    }
    return filterResult;
}
Also used : Not(uk.gov.gchq.koryphe.impl.predicate.Not) Or(uk.gov.gchq.koryphe.impl.predicate.Or) AgeOff(uk.gov.gchq.koryphe.impl.predicate.AgeOff) And(uk.gov.gchq.koryphe.impl.predicate.And) List(java.util.List) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) TupleAdaptedPredicate(uk.gov.gchq.koryphe.tuple.predicate.TupleAdaptedPredicate) Predicate(java.util.function.Predicate)
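
The recursion above mirrors the combinators that FilterApi itself exposes: Koryphe And, Or and Not map onto FilterApi.and, FilterApi.or and FilterApi.not. A short sketch of that target composition, using illustrative column names, is:

import static org.apache.parquet.filter2.predicate.FilterApi.and;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.lt;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.not;
import static org.apache.parquet.filter2.predicate.FilterApi.or;

import org.apache.parquet.filter2.predicate.FilterPredicate;

public class ComposedPredicateSketch {
    public static void main(final String[] args) {
        // (timestamp < t) AND NOT (count < 5 OR flag == 1)
        final FilterPredicate recent = lt(longColumn("timestamp"), 1_700_000_000_000L);
        final FilterPredicate smallOrFlagged = or(lt(intColumn("count"), 5), eq(intColumn("flag"), 1));
        final FilterPredicate combined = and(recent, not(smallOrFlagged));
        System.out.println(combined);
    }
}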

Example 59 with FilterPredicate

Use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq.

Class JavaPredicateToParquetPredicate, method getIsLessThanFilter.

private FilterPredicate getIsLessThanFilter(final String colName, final Object[] parquetObjects, final String group, final SchemaUtils schemaUtils) {
    String[] paths = schemaUtils.getPaths(group, colName);
    if (null == paths) {
        paths = new String[1];
        paths[0] = colName;
    }
    FilterPredicate filter = null;
    for (int i = 0; i < paths.length; i++) {
        final String path = paths[i];
        FilterPredicate tempFilter;
        if (parquetObjects[i] instanceof String) {
            tempFilter = lt(binaryColumn(path), Binary.fromString((String) parquetObjects[i]));
        } else if (parquetObjects[i] instanceof Double) {
            tempFilter = lt(doubleColumn(path), (Double) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Float) {
            tempFilter = lt(floatColumn(path), (Float) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Integer) {
            tempFilter = lt(intColumn(path), (Integer) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Long) {
            tempFilter = lt(longColumn(path), (Long) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof java.util.Date) {
            tempFilter = lt(longColumn(path), ((java.util.Date) parquetObjects[i]).getTime());
        } else if (parquetObjects[i] instanceof java.sql.Date) {
            tempFilter = lt(longColumn(path), ((java.sql.Date) parquetObjects[i]).getTime());
        } else if (parquetObjects[i] instanceof Short) {
            tempFilter = lt(intColumn(path), ((Short) parquetObjects[i]).intValue());
        } else if (parquetObjects[i] instanceof byte[]) {
            tempFilter = lt(binaryColumn(path), Binary.fromReusedByteArray((byte[]) parquetObjects[i]));
        } else {
            fullyApplied = false;
            LOGGER.warn(parquetObjects[i].getClass().getCanonicalName() + " is not a natively supported type for the IsLessThan filter, therefore execution will take longer to perform this filter.");
            return null;
        }
        if (null == filter) {
            filter = tempFilter;
        } else {
            filter = and(filter, tempFilter);
        }
    }
    return filter;
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate)
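
The bulk of the method above is a dispatch from the Java type of the comparison value to the matching typed Parquet column. A cut-down sketch of that dispatch (only a few branches, with null signalling "cannot push this filter down") could look like this:

import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.lt;

import java.util.Date;

import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.io.api.Binary;

public class LessThanDispatchSketch {

    // Build an "is less than" predicate for the given column path, or return
    // null if the value's type has no native Parquet column mapping here.
    public static FilterPredicate isLessThan(final String path, final Object value) {
        if (value instanceof String) {
            return lt(binaryColumn(path), Binary.fromString((String) value));
        } else if (value instanceof Integer) {
            return lt(intColumn(path), (Integer) value);
        } else if (value instanceof Long) {
            return lt(longColumn(path), (Long) value);
        } else if (value instanceof Date) {
            // Dates are compared via their epoch-millisecond encoding
            return lt(longColumn(path), ((Date) value).getTime());
        }
        return null;
    }

    public static void main(final String[] args) {
        System.out.println(isLessThan("count", 10));
    }
}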

Example 60 with FilterPredicate

Use of org.apache.parquet.filter2.predicate.FilterPredicate in project hive by apache.

Class FilterPredicateLeafBuilder, method buildPredicate.

/**
 * Build filter predicate with multiple constants
 *
 * @param op         IN or BETWEEN
 * @param literals
 * @param columnName
 * @param columnType
 * @return
 */
public FilterPredicate buildPredicate(PredicateLeaf.Operator op, List<Object> literals, String columnName, TypeInfo columnType) throws Exception {
    FilterPredicate result = null;
    switch(op) {
        case IN:
            for (Object literal : literals) {
                if (result == null) {
                    result = buildPredict(PredicateLeaf.Operator.EQUALS, literal, columnName, columnType);
                } else {
                    result = or(result, buildPredict(PredicateLeaf.Operator.EQUALS, literal, columnName, columnType));
                }
            }
            return result;
        case BETWEEN:
            if (literals.size() != 2) {
                throw new RuntimeException("Not able to build 'between' operation filter with " + literals + " which needs two literals");
            }
            Object min = literals.get(0);
            Object max = literals.get(1);
            FilterPredicate lt = not(buildPredict(PredicateLeaf.Operator.LESS_THAN, min, columnName, columnType));
            FilterPredicate gt = buildPredict(PredicateLeaf.Operator.LESS_THAN_EQUALS, max, columnName, columnType);
            result = FilterApi.and(gt, lt);
            return result;
        default:
            throw new RuntimeException("Unknown PredicateLeaf Operator type: " + op);
    }
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate)
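
The two cases above expand IN into a chain of equality predicates joined with or(...), and BETWEEN into the conjunction of the two bounds (not(lt(min)) is equivalent to gtEq(min)). A standalone sketch of those expansions for a long column, with illustrative column names and values, is:

import static org.apache.parquet.filter2.predicate.FilterApi.and;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.gtEq;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.ltEq;
import static org.apache.parquet.filter2.predicate.FilterApi.or;

import java.util.Arrays;
import java.util.List;

import org.apache.parquet.filter2.predicate.FilterPredicate;

public class InBetweenSketch {

    // IN: column == v1 OR column == v2 OR ...
    public static FilterPredicate inLongs(final String column, final List<Long> values) {
        FilterPredicate result = null;
        for (final Long value : values) {
            final FilterPredicate equals = eq(longColumn(column), value);
            result = result == null ? equals : or(result, equals);
        }
        return result;
    }

    // BETWEEN: min <= column <= max
    public static FilterPredicate betweenLongs(final String column, final long min, final long max) {
        return and(gtEq(longColumn(column), min), ltEq(longColumn(column), max));
    }

    public static void main(final String[] args) {
        System.out.println(inLongs("id", Arrays.asList(1L, 2L, 3L)));
        System.out.println(betweenLongs("id", 10L, 20L));
    }
}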

Aggregations

FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate): 76
Test (org.junit.Test): 50
HashMap (java.util.HashMap): 33
MessageType (org.apache.parquet.schema.MessageType): 33
TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 32
SearchArgument (org.apache.hadoop.hive.ql.io.sarg.SearchArgument): 25
HiveChar (org.apache.hadoop.hive.common.type.HiveChar): 12
BinaryColumn (org.apache.parquet.filter2.predicate.Operators.BinaryColumn): 8
ArrayList (java.util.ArrayList): 5
List (java.util.List): 5
Group (org.apache.parquet.example.data.Group): 5
Configuration (org.apache.hadoop.conf.Configuration): 4
HiveVarchar (org.apache.hadoop.hive.common.type.HiveVarchar): 4
User (org.apache.parquet.filter2.recordlevel.PhoneBookWriter.User): 4
Predicate (java.util.function.Predicate): 3
Path (org.apache.hadoop.fs.Path): 3
Pair (uk.gov.gchq.gaffer.commonutil.pair.Pair): 3
TupleAdaptedPredicate (uk.gov.gchq.koryphe.tuple.predicate.TupleAdaptedPredicate): 3
HashSet (java.util.HashSet): 2
GenericRecord (org.apache.avro.generic.GenericRecord): 2