Example usage of org.apache.parquet.filter2.compat.FilterCompat.Filter in the Gaffer project (gchq):
class QueryGenerator, method getPathsAndFiltersForGetElements.
/**
 * Builds a {@link ParquetQuery} describing which Parquet files must be read, and with
 * which filter predicates, to answer the given {@code GetElements} operation.
 *
 * @param getElements the operation containing the seeds, view and directed type
 * @return a {@code ParquetQuery} mapping groups to file-level queries; empty if there are no seeds
 * @throws SerialisationException if a seed cannot be serialised to Parquet form
 * @throws OperationException if the operation is invalid
 */
private ParquetQuery getPathsAndFiltersForGetElements(final GetElements getElements) throws SerialisationException, OperationException {
final Iterable<? extends ElementId> seeds = getElements.getInput();
// No seeds means nothing can match: return an empty query rather than scanning.
if (null == seeds || !seeds.iterator().hasNext()) {
return new ParquetQuery();
}
// Stage 1: Use the view to identify all groups that might contain data
final Set<String> allRelevantGroups = getRelevantGroups(getElements.getView());
// Stage 2: For each of the above groups, create a Parquet predicate from the view and directedType
final Map<String, Pair<FilterPredicate, Boolean>> groupToPredicate = new HashMap<>();
for (final String group : allRelevantGroups) {
Pair<FilterPredicate, Boolean> filter = getPredicateFromView(getElements.getView(), group, schemaUtils.getEntityGroups().contains(group));
// Edge groups additionally need the directed-type restriction ANDed in.
if (schemaUtils.getEdgeGroups().contains(group)) {
final FilterPredicate directedTypeFilter = getPredicateFromDirectedType(getElements.getDirectedType());
filter.setFirst(FilterPredicateUtils.and(filter.getFirst(), directedTypeFilter));
}
groupToPredicate.put(group, filter);
}
// Stage 3: Convert seeds to ParquetElementSeeds and create Stream of <group, ParquetElementSeed> pairs where
// each seed appears once for each of the relevant groups
final Stream<Pair<String, ParquetElementSeed>> groupAndSeeds = StreamSupport.stream(seeds.spliterator(), false).flatMap(seed -> {
try {
return seedToParquetObject(seed, allRelevantGroups).stream();
} catch (final SerialisationException e) {
// Streams cannot propagate checked exceptions; wrap and preserve the cause.
throw new RuntimeException("SerialisationException converting seed into a Parquet object", e);
}
});
// Stage 4: Convert stream of <group, ParquetElementSeed> pairs to stream of tuples
// <group, ParquetElementSeed, List<PathInfo>>
final Stream<Tuple3<String, ParquetElementSeed, Set<PathInfo>>> groupSeedsAndPaths = groupAndSeeds.map(pair -> getRelevantFiles(pair.getFirst(), pair.getSecond()));
// Stage 5: Create map from path to list of <group, reversed edge flag, Parquet seeds>
// TODO: Currently this consumes the entire stream - need to do this in batches
final List<Tuple3<String, ParquetElementSeed, Set<PathInfo>>> groupSeedsAndPathsList = groupSeedsAndPaths.collect(Collectors.toList());
final Map<PathInfo, List<Tuple3<String, Boolean, ParquetElementSeed>>> pathToSeeds = new HashMap<>();
for (final Tuple3<String, ParquetElementSeed, Set<PathInfo>> tuple : groupSeedsAndPathsList) {
for (final PathInfo pathInfo : tuple.get2()) {
// computeIfAbsent avoids the containsKey/put double lookup of the original.
pathToSeeds.computeIfAbsent(pathInfo, p -> new ArrayList<>())
.add(new Tuple3<>(tuple.get0(), pathInfo.isReversed(), tuple.get1()));
}
}
// Stage 6: Create ParquetQuery
final SeededGraphFilters.IncludeIncomingOutgoingType includeIncomingOutgoingType = getElements.getIncludeIncomingOutGoing();
final SeedMatching.SeedMatchingType seedMatchingType = getElements.getSeedMatching();
final ParquetQuery parquetQuery = new ParquetQuery();
for (final Map.Entry<PathInfo, List<Tuple3<String, Boolean, ParquetElementSeed>>> entry : pathToSeeds.entrySet()) {
final PathInfo pathInfo = entry.getKey();
final FilterPredicate seedPredicate = seedsToPredicate(entry.getValue(), includeIncomingOutgoingType, seedMatchingType);
if (null != seedPredicate) {
final String group = pathInfo.getGroup();
final Pair<FilterPredicate, Boolean> viewFilterPredicate = groupToPredicate.get(group);
FilterPredicate filterPredicate = seedPredicate;
// Bug fix: the original called viewFilterPredicate.getSecond() unconditionally,
// which threw a NullPointerException whenever no view predicate existed for the
// group. The flag is now only read inside the null guard; false is presumed to be
// the correct "no view predicate" default — TODO confirm against ParquetFileQuery.
boolean viewFlag = false;
if (null != viewFilterPredicate) {
// Put view predicate first as filter for checking whether it matches one of many seeds could be complex
filterPredicate = FilterPredicateUtils.and(viewFilterPredicate.getFirst(), filterPredicate);
viewFlag = viewFilterPredicate.getSecond();
}
final ParquetFileQuery fileQuery = new ParquetFileQuery(pathInfo.getPath(), filterPredicate, viewFlag);
parquetQuery.add(group, fileQuery);
}
}
LOGGER.info("Created ParquetQuery of {}", parquetQuery);
return parquetQuery;
}
Example usage of org.apache.parquet.filter2.compat.FilterCompat.Filter in the Gaffer project (gchq):
class JavaPredicateToParquetPredicate, method getAgeOffPredicate.
/**
 * Builds a Parquet {@link FilterPredicate} implementing the Gaffer {@code AgeOff}
 * predicate: keeps rows whose timestamp column is newer than "now minus the age-off period".
 *
 * @param ageOff      the age-off predicate supplying the maximum age in milliseconds
 * @param selection   the selected property names; only the first entry is used
 * @param group       the element group whose schema paths are consulted
 * @param schemaUtils utility resolving a property name to its Parquet column paths
 * @return a predicate ANDing a greater-than test over every resolved column path
 */
public FilterPredicate getAgeOffPredicate(final AgeOff ageOff, final String[] selection, final String group, final SchemaUtils schemaUtils) {
    String[] paths = schemaUtils.getPaths(group, selection[0]);
    if (paths == null) {
        // No mapped Parquet paths: fall back to treating the property name itself as the column.
        paths = new String[]{selection[0]};
    }
    // Bug fix: the original re-read System.currentTimeMillis() on every loop iteration,
    // so each column path could receive a slightly different age-off threshold. Compute
    // the threshold once (and as a primitive long, avoiding pointless autoboxing).
    final long ageOffThreshold = System.currentTimeMillis() - ageOff.getAgeOffTime();
    FilterPredicate filter = null;
    for (final String path : paths) {
        final FilterPredicate tempFilter = gt(longColumn(path), ageOffThreshold);
        // AND the per-path predicates together; the first one seeds the chain.
        filter = (filter == null) ? tempFilter : and(filter, tempFilter);
    }
    return filter;
}
Example usage of org.apache.parquet.filter2.compat.FilterCompat.Filter in the Gaffer project (gchq):
class JavaPredicateToParquetPredicate, method getIsLessThanFilter.
/**
 * Builds a Parquet {@link FilterPredicate} implementing a Gaffer {@code IsLessThan}
 * filter over the given column, for each Parquet path the column maps to.
 *
 * @param colName        the Gaffer property name being filtered
 * @param parquetObjects the comparison values, one per resolved path, already converted to Parquet-friendly objects
 * @param group          the element group whose schema paths are consulted
 * @param schemaUtils    utility resolving a property name to its Parquet column paths
 * @return a predicate ANDing a less-than test per path, or {@code null} if any value's
 *         type is not natively supported (in which case {@code fullyApplied} is cleared
 *         so the filter is applied post-scan instead)
 */
private FilterPredicate getIsLessThanFilter(final String colName, final Object[] parquetObjects, final String group, final SchemaUtils schemaUtils) {
    String[] paths = schemaUtils.getPaths(group, colName);
    if (null == paths) {
        // No mapped Parquet paths: fall back to treating the property name itself as the column.
        paths = new String[]{colName};
    }
    FilterPredicate filter = null;
    for (int i = 0; i < paths.length; i++) {
        final String path = paths[i];
        final FilterPredicate tempFilter;
        if (parquetObjects[i] instanceof String) {
            tempFilter = lt(binaryColumn(path), Binary.fromString((String) parquetObjects[i]));
        } else if (parquetObjects[i] instanceof Double) {
            tempFilter = lt(doubleColumn(path), (Double) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Float) {
            tempFilter = lt(floatColumn(path), (Float) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Integer) {
            tempFilter = lt(intColumn(path), (Integer) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Long) {
            tempFilter = lt(longColumn(path), (Long) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof java.util.Date) {
            // Note: this branch also handles java.sql.Date (a subclass of java.util.Date);
            // the original code had a separate, unreachable java.sql.Date branch after
            // this one — it has been removed as dead code (both called getTime()).
            tempFilter = lt(longColumn(path), ((java.util.Date) parquetObjects[i]).getTime());
        } else if (parquetObjects[i] instanceof Short) {
            // Parquet has no 16-bit column type; widen to int.
            tempFilter = lt(intColumn(path), ((Short) parquetObjects[i]).intValue());
        } else if (parquetObjects[i] instanceof byte[]) {
            tempFilter = lt(binaryColumn(path), Binary.fromReusedByteArray((byte[]) parquetObjects[i]));
        } else {
            // Unsupported type: signal that this filter must be re-applied after the scan.
            fullyApplied = false;
            // Parameterized logging avoids eager string concatenation when WARN is disabled.
            LOGGER.warn("{} is not a natively supported type for the IsLessThan filter, therefore execution will take longer to perform this filter.", parquetObjects[i].getClass().getCanonicalName());
            return null;
        }
        // AND the per-path predicates together; the first one seeds the chain.
        filter = (null == filter) ? tempFilter : and(filter, tempFilter);
    }
    return filter;
}
Example usage of org.apache.parquet.filter2.compat.FilterCompat.Filter in the Flink project (apache):
class ParquetVectorizedInputFormat, method createReader.
/**
 * Creates a {@code ParquetReader} for the given split: reads the file footer for the
 * split's byte range, applies the configured row-group filter, clips the file schema to
 * the requested columns, and sets up a pool of reusable column batches.
 *
 * @param config the Flink configuration for this reader
 * @param split  the file split (path, offset, length) to read
 * @return a reader positioned over the filtered row groups of the split
 * @throws IOException if the footer or file cannot be read
 */
@Override
public ParquetReader createReader(final Configuration config, final SplitT split) throws IOException {
    final org.apache.hadoop.fs.Path hadoopPath =
            new org.apache.hadoop.fs.Path(split.path().toUri());
    // Only the footer metadata covering this split's byte range is needed.
    final ParquetMetadata footer =
            readFooter(hadoopConfig.conf(), hadoopPath, range(split.offset(), split.offset() + split.length()));
    final MessageType fileSchema = footer.getFileMetaData().getSchema();
    // Drop row groups that the configured predicate proves cannot match.
    final FilterCompat.Filter rowGroupFilter = getFilter(hadoopConfig.conf());
    final List<BlockMetaData> filteredBlocks = filterRowGroups(rowGroupFilter, footer.getBlocks(), fileSchema);
    // Restrict reading to the requested (projected) columns.
    final MessageType requestedSchema = clipParquetSchema(fileSchema);
    final ParquetFileReader fileReader = new ParquetFileReader(
            hadoopConfig.conf(), footer.getFileMetaData(), hadoopPath, filteredBlocks, requestedSchema.getColumns());
    // Total rows across the surviving row groups.
    long rowCount = 0;
    for (final BlockMetaData block : filteredBlocks) {
        rowCount += block.getRowCount();
    }
    checkSchema(fileSchema, requestedSchema);
    final Pool<ParquetReaderBatch<T>> batchPool =
            createPoolOfBatches(split, requestedSchema, numBatchesToCirculate(config));
    return new ParquetReader(fileReader, requestedSchema, rowCount, batchPool);
}
Aggregations