Example 26 with Filter

Use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project Gaffer by gchq.

From the class QueryGenerator, method getPathsAndFiltersForGetElements:

private ParquetQuery getPathsAndFiltersForGetElements(final GetElements getElements) throws SerialisationException, OperationException {
    final Iterable<? extends ElementId> seeds = getElements.getInput();
    if (null == seeds || !seeds.iterator().hasNext()) {
        return new ParquetQuery();
    }
    // Stage 1: Use the view to identify all groups that might contain data
    final Set<String> allRelevantGroups = getRelevantGroups(getElements.getView());
    // Stage 2: For each of the above groups, create a Parquet predicate from the view and directedType
    final Map<String, Pair<FilterPredicate, Boolean>> groupToPredicate = new HashMap<>();
    for (final String group : allRelevantGroups) {
        Pair<FilterPredicate, Boolean> filter = getPredicateFromView(getElements.getView(), group, schemaUtils.getEntityGroups().contains(group));
        if (schemaUtils.getEdgeGroups().contains(group)) {
            final FilterPredicate directedTypeFilter = getPredicateFromDirectedType(getElements.getDirectedType());
            filter.setFirst(FilterPredicateUtils.and(filter.getFirst(), directedTypeFilter));
        }
        groupToPredicate.put(group, filter);
    }
    // Stage 3: Convert seeds to ParquetElementSeeds and create Stream of <group, ParquetElementSeed> pairs where
    // each seed appears once for each of the relevant groups
    final Stream<Pair<String, ParquetElementSeed>> groupAndSeeds = StreamSupport.stream(seeds.spliterator(), false).flatMap(seed -> {
        try {
            return seedToParquetObject(seed, allRelevantGroups).stream();
        } catch (final SerialisationException e) {
            throw new RuntimeException("SerialisationException converting seed into a Parquet object", e);
        }
    });
    // Stage 4: Convert the stream of <group, ParquetElementSeed> pairs to a stream of tuples
    // <group, ParquetElementSeed, Set<PathInfo>>
    final Stream<Tuple3<String, ParquetElementSeed, Set<PathInfo>>> groupSeedsAndPaths = groupAndSeeds.map(pair -> getRelevantFiles(pair.getFirst(), pair.getSecond()));
    // Stage 5: Create map from path to list of <group, reversed edge flag, Parquet seeds>
    // TODO: Currently this consumes the entire stream - need to do this in batches
    final List<Tuple3<String, ParquetElementSeed, Set<PathInfo>>> groupSeedsAndPathsList = groupSeedsAndPaths.collect(Collectors.toList());
    final Map<PathInfo, List<Tuple3<String, Boolean, ParquetElementSeed>>> pathToSeeds = new HashMap<>();
    for (final Tuple3<String, ParquetElementSeed, Set<PathInfo>> tuple : groupSeedsAndPathsList) {
        final Set<PathInfo> paths = tuple.get2();
        for (final PathInfo pathInfo : paths) {
            pathToSeeds.computeIfAbsent(pathInfo, k -> new ArrayList<>())
                    .add(new Tuple3<>(tuple.get0(), pathInfo.isReversed(), tuple.get1()));
        }
    }
    // Stage 6: Create ParquetQuery
    final SeededGraphFilters.IncludeIncomingOutgoingType includeIncomingOutgoingType = getElements.getIncludeIncomingOutGoing();
    final SeedMatching.SeedMatchingType seedMatchingType = getElements.getSeedMatching();
    final ParquetQuery parquetQuery = new ParquetQuery();
    for (final Map.Entry<PathInfo, List<Tuple3<String, Boolean, ParquetElementSeed>>> entry : pathToSeeds.entrySet()) {
        final PathInfo pathInfo = entry.getKey();
        final List<Tuple3<String, Boolean, ParquetElementSeed>> seedList = entry.getValue();
        FilterPredicate filterPredicate = seedsToPredicate(seedList, includeIncomingOutgoingType, seedMatchingType);
        if (null != filterPredicate) {
            final String group = pathInfo.getGroup();
            final Pair<FilterPredicate, Boolean> viewFilterPredicate = groupToPredicate.get(group);
            // Every relevant group was given a predicate in Stage 2, so this should never be null;
            // guard anyway, as dereferencing a missing entry below would throw a NullPointerException
            if (null != viewFilterPredicate) {
                // Put the view predicate first, as checking whether an element matches one of many seeds can be complex
                filterPredicate = FilterPredicateUtils.and(viewFilterPredicate.getFirst(), filterPredicate);
                final ParquetFileQuery fileQuery = new ParquetFileQuery(pathInfo.getPath(), filterPredicate, viewFilterPredicate.getSecond());
                parquetQuery.add(group, fileQuery);
            }
        }
    }
    LOGGER.info("Created ParquetQuery of {}", parquetQuery);
    return parquetQuery;
}
Also used: HashSet (java.util.HashSet), Set (java.util.Set), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), List (java.util.List), Pair (uk.gov.gchq.gaffer.commonutil.pair.Pair), SerialisationException (uk.gov.gchq.gaffer.exception.SerialisationException), Tuple3 (uk.gov.gchq.koryphe.tuple.n.Tuple3), FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate), SeededGraphFilters (uk.gov.gchq.gaffer.operation.graph.SeededGraphFilters), SeedMatching (uk.gov.gchq.gaffer.operation.SeedMatching)
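
For reference, the FilterPredicate values combined above are built with Parquet's FilterApi, and only become a FilterCompat.Filter once wrapped for a reader. Below is a minimal, self-contained sketch of that flow; the column names "group" and "timestamp" and the value "BasicEdge" are illustrative assumptions, not Gaffer's actual schema.

import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.io.api.Binary;

public class FilterSketch {

    public static FilterCompat.Filter buildFilter() {
        // Typed predicates on a String-valued (binary) column and a long column, AND-ed together
        final FilterPredicate byGroup = FilterApi.eq(FilterApi.binaryColumn("group"), Binary.fromString("BasicEdge"));
        final FilterPredicate byTimestamp = FilterApi.gtEq(FilterApi.longColumn("timestamp"), 0L);
        final FilterPredicate combined = FilterApi.and(byGroup, byTimestamp);
        // FilterCompat.get wraps the predicate in the FilterCompat.Filter form that readers accept
        return FilterCompat.get(combined);
    }
}

Gaffer's FilterPredicateUtils.and plays the same role as FilterApi.and here, presumably adding null handling for absent predicates.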

Example 27 with Filter

Use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project Gaffer by gchq.

From the class JavaPredicateToParquetPredicate, method getAgeOffPredicate:

public FilterPredicate getAgeOffPredicate(final AgeOff ageOff, final String[] selection, final String group, final SchemaUtils schemaUtils) {
    String[] paths = schemaUtils.getPaths(group, selection[0]);
    if (paths == null) {
        // Fall back to using the selection name itself as the column path
        paths = new String[]{selection[0]};
    }
    FilterPredicate filter = null;
    // The age-off threshold is the same for every path, so compute it once, outside the loop
    final long ageOffTime = System.currentTimeMillis() - ageOff.getAgeOffTime();
    for (final String path : paths) {
        final FilterPredicate tempFilter = gt(longColumn(path), ageOffTime);
        if (filter == null) {
            filter = tempFilter;
        } else {
            filter = and(filter, tempFilter);
        }
    }
    return filter;
}
Also used: FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate)
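
As a usage sketch, an equivalent age-off predicate can be built directly with Parquet's FilterApi, without the Gaffer schema lookup. The column name "timestamp" and the 24-hour window are illustrative assumptions.

import static org.apache.parquet.filter2.predicate.FilterApi.gt;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;

import org.apache.parquet.filter2.predicate.FilterPredicate;

public class AgeOffSketch {

    // Keep only rows whose timestamp column is newer than the age-off window
    public static FilterPredicate lastDayOnly() {
        final long ageOffMillis = 24L * 60 * 60 * 1000;
        final long threshold = System.currentTimeMillis() - ageOffMillis;
        return gt(longColumn("timestamp"), threshold);
    }
}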

Example 28 with Filter

Use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project Gaffer by gchq.

From the class JavaPredicateToParquetPredicate, method getIsLessThanFilter:

private FilterPredicate getIsLessThanFilter(final String colName, final Object[] parquetObjects, final String group, final SchemaUtils schemaUtils) {
    String[] paths = schemaUtils.getPaths(group, colName);
    if (null == paths) {
        // Fall back to using the column name itself as the path
        paths = new String[]{colName};
    }
    FilterPredicate filter = null;
    for (int i = 0; i < paths.length; i++) {
        final String path = paths[i];
        FilterPredicate tempFilter;
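        // Dispatch on the runtime type of the value to build the matching typed column predicate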
        if (parquetObjects[i] instanceof String) {
            tempFilter = lt(binaryColumn(path), Binary.fromString((String) parquetObjects[i]));
        } else if (parquetObjects[i] instanceof Double) {
            tempFilter = lt(doubleColumn(path), (Double) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Float) {
            tempFilter = lt(floatColumn(path), (Float) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Integer) {
            tempFilter = lt(intColumn(path), (Integer) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Long) {
            tempFilter = lt(longColumn(path), (Long) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof java.util.Date) {
            tempFilter = lt(longColumn(path), ((java.util.Date) parquetObjects[i]).getTime());
        } else if (parquetObjects[i] instanceof java.sql.Date) {
            tempFilter = lt(longColumn(path), ((java.sql.Date) parquetObjects[i]).getTime());
        } else if (parquetObjects[i] instanceof Short) {
            tempFilter = lt(intColumn(path), ((Short) parquetObjects[i]).intValue());
        } else if (parquetObjects[i] instanceof byte[]) {
            tempFilter = lt(binaryColumn(path), Binary.fromReusedByteArray((byte[]) parquetObjects[i]));
        } else {
            fullyApplied = false;
            LOGGER.warn("{} is not a natively supported type for the IsLessThan filter, so applying this filter will take longer.", parquetObjects[i].getClass().getCanonicalName());
            return null;
        }
        if (null == filter) {
            filter = tempFilter;
        } else {
            filter = and(filter, tempFilter);
        }
    }
    return filter;
}
Also used: FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate)
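
The type-dispatch pattern above can be reduced to the following sketch, which maps a Java value to the corresponding typed Parquet less-than predicate. The helper name and the reduced set of supported types are assumptions for illustration, not Gaffer code.

import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.lt;

import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.io.api.Binary;

public class LessThanSketch {

    // Returns null when the value's type has no native Parquet predicate, mirroring the method above
    public static FilterPredicate lessThan(final String path, final Object value) {
        if (value instanceof String) {
            return lt(binaryColumn(path), Binary.fromString((String) value));
        } else if (value instanceof Integer) {
            return lt(intColumn(path), (Integer) value);
        } else if (value instanceof Long) {
            return lt(longColumn(path), (Long) value);
        }
        return null;
    }
}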

Example 29 with Filter

Use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project flink by apache.

From the class ParquetVectorizedInputFormat, method createReader:

@Override
public ParquetReader createReader(final Configuration config, final SplitT split) throws IOException {
    final Path filePath = split.path();
    final long splitOffset = split.offset();
    final long splitLength = split.length();
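    // Convert the Flink path to a Hadoop path, then read only the footer metadata for this split's byte range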
    org.apache.hadoop.fs.Path hadoopPath = new org.apache.hadoop.fs.Path(filePath.toUri());
    ParquetMetadata footer = readFooter(hadoopConfig.conf(), hadoopPath, range(splitOffset, splitOffset + splitLength));
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    FilterCompat.Filter filter = getFilter(hadoopConfig.conf());
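    // Drop row groups whose statistics show they cannot contain rows matching the filter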
    List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
    MessageType requestedSchema = clipParquetSchema(fileSchema);
    ParquetFileReader reader = new ParquetFileReader(hadoopConfig.conf(), footer.getFileMetaData(), hadoopPath, blocks, requestedSchema.getColumns());
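    // Sum the row counts of the row groups that survived filtering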
    long totalRowCount = 0;
    for (BlockMetaData block : blocks) {
        totalRowCount += block.getRowCount();
    }
    checkSchema(fileSchema, requestedSchema);
    final Pool<ParquetReaderBatch<T>> poolOfBatches = createPoolOfBatches(split, requestedSchema, numBatchesToCirculate(config));
    return new ParquetReader(reader, requestedSchema, totalRowCount, poolOfBatches);
}
Also used: Path (org.apache.flink.core.fs.Path), BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), FilterCompat (org.apache.parquet.filter2.compat.FilterCompat), ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), MessageType (org.apache.parquet.schema.MessageType)
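
For completeness, the filter that getFilter(hadoopConfig.conf()) retrieves is normally pushed into the configuration beforehand. Below is a minimal round-trip sketch using ParquetInputFormat's static helpers; the column name "id" and the threshold are illustrative assumptions.

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class FilterConfigSketch {

    public static FilterCompat.Filter roundTrip() {
        final Configuration conf = new Configuration();
        // Push a predicate into the configuration...
        final FilterPredicate predicate = FilterApi.gtEq(FilterApi.longColumn("id"), 100L);
        ParquetInputFormat.setFilterPredicate(conf, predicate);
        // ...then read it back as the FilterCompat.Filter that createReader consumes above
        return ParquetInputFormat.getFilter(conf);
    }
}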

Aggregations

FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate): 24 usages
ArrayList (java.util.ArrayList): 7 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 6 usages
Path (org.apache.hadoop.fs.Path): 5 usages
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 5 usages
MessageType (org.apache.parquet.schema.MessageType): 5 usages
HashMap (java.util.HashMap): 4 usages
FilterCompat (org.apache.parquet.filter2.compat.FilterCompat): 4 usages
HashSet (java.util.HashSet): 3 usages
List (java.util.List): 3 usages
TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 3 usages
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 3 usages
Pair (uk.gov.gchq.gaffer.commonutil.pair.Pair): 3 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 2 usages
SearchArgument (org.apache.hadoop.hive.ql.io.sarg.SearchArgument): 2 usages
UnboundRecordFilter (org.apache.parquet.filter.UnboundRecordFilter): 2 usages
Filter (org.apache.parquet.filter2.compat.FilterCompat.Filter): 2 usages
FilterPredicateCompat (org.apache.parquet.filter2.compat.FilterCompat.FilterPredicateCompat): 2 usages
ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit): 2 usages
Test (org.junit.Test): 2 usages