Use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq: class JavaPredicateToParquetPredicate, method getIsMoreThanFilter.
/**
 * Builds a Parquet {@link FilterPredicate} implementing an IsMoreThan (strictly greater-than)
 * comparison for the given column, combining one predicate per schema path with AND.
 *
 * @param colName        the Gaffer column name being filtered
 * @param parquetObjects the comparison values, one per path returned for {@code colName}
 * @param group          the element group the column belongs to
 * @param schemaUtils    used to resolve the Parquet paths backing the column
 * @return the combined predicate, or {@code null} if any value has a type that is not
 *         natively supported (in which case {@code fullyApplied} is set to {@code false}
 *         so the filter is applied post-read instead)
 */
private FilterPredicate getIsMoreThanFilter(final String colName, final Object[] parquetObjects, final String group, final SchemaUtils schemaUtils) {
String[] paths = schemaUtils.getPaths(group, colName);
if (null == paths) {
// No schema mapping for this column: fall back to using the column name as the path.
paths = new String[]{colName};
}
FilterPredicate filter = null;
for (int i = 0; i < paths.length; i++) {
final String path = paths[i];
final Object parquetObject = parquetObjects[i];
final FilterPredicate tempFilter;
if (parquetObject instanceof String) {
tempFilter = gt(binaryColumn(path), Binary.fromString((String) parquetObject));
} else if (parquetObject instanceof Double) {
tempFilter = gt(doubleColumn(path), (Double) parquetObject);
} else if (parquetObject instanceof Float) {
tempFilter = gt(floatColumn(path), (Float) parquetObject);
} else if (parquetObject instanceof Integer) {
tempFilter = gt(intColumn(path), (Integer) parquetObject);
} else if (parquetObject instanceof Long) {
tempFilter = gt(longColumn(path), (Long) parquetObject);
} else if (parquetObject instanceof java.util.Date) {
// Also covers java.sql.Date, which is a subclass of java.util.Date; both are
// compared as epoch milliseconds. (A separate java.sql.Date branch would be
// unreachable after this one.)
tempFilter = gt(longColumn(path), ((java.util.Date) parquetObject).getTime());
} else if (parquetObject instanceof Short) {
// Shorts are widened to int, matching how they are stored in Parquet.
tempFilter = gt(intColumn(path), ((Short) parquetObject).intValue());
} else if (parquetObject instanceof byte[]) {
tempFilter = gt(binaryColumn(path), Binary.fromReusedByteArray((byte[]) parquetObject));
} else {
// Unsupported (or null) value type: signal that the filter must be applied post-read.
fullyApplied = false;
final String typeName = null == parquetObject ? "null" : parquetObject.getClass().getCanonicalName();
LOGGER.warn("{} is not a natively supported type for the IsMoreThan filter, therefore execution will take longer to perform this filter.", typeName);
return null;
}
// AND the per-path predicates together.
filter = null == filter ? tempFilter : and(filter, tempFilter);
}
return filter;
}
Use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq: class JavaPredicateToParquetPredicate, method getAndFilter.
/**
 * Converts a list of Java {@link Predicate}s into a single Parquet {@link FilterPredicate}
 * by converting each one individually and combining the results with AND.
 *
 * <p>If any individual conversion could not be fully pushed down to Parquet, the
 * {@code fullyApplied} flag on this converter is cleared as well.
 *
 * @param predicateList the Java predicates to convert
 * @param selection     the tuple selection the predicates operate on
 * @param group         the element group being filtered
 * @param schemaUtils   schema helper used by the per-predicate converters
 * @return the ANDed Parquet predicate, or {@code null} if the list produced none
 * @throws SerialisationException if a predicate's values cannot be serialised
 */
public FilterPredicate getAndFilter(final List<Predicate> predicateList, final String[] selection, final String group, final SchemaUtils schemaUtils) throws SerialisationException {
FilterPredicate result = null;
for (final Predicate current : predicateList) {
final Predicate javaPredicate;
final String[] mappedSelection;
if (current instanceof TupleAdaptedPredicate) {
// Unwrap the adapted predicate and remap its integer selection onto the outer selection.
final TupleAdaptedPredicate tupleAdapted = (TupleAdaptedPredicate) current;
javaPredicate = tupleAdapted.getPredicate();
final Integer[] indices = (Integer[]) tupleAdapted.getSelection();
mappedSelection = new String[indices.length];
for (int pos = 0; pos < indices.length; pos++) {
mappedSelection[pos] = selection[indices[pos]];
}
} else {
javaPredicate = current;
mappedSelection = selection;
}
final JavaPredicateToParquetPredicate converter = new JavaPredicateToParquetPredicate(schemaUtils, javaPredicate, mappedSelection, group);
final FilterPredicate converted = converter.getParquetPredicate();
if (!converter.fullyApplied) {
// Propagate "could not fully push down" from the sub-conversion.
fullyApplied = false;
}
result = FilterPredicateUtils.and(result, converted);
}
return result;
}
Use of org.apache.parquet.filter2.predicate.FilterPredicate in project Gaffer by gchq: class JavaPredicateToParquetPredicate, method getIsMoreThanOrEqualToFilter.
/**
 * Builds a Parquet {@link FilterPredicate} implementing an IsMoreThanOrEqualTo
 * (greater-than-or-equal) comparison for the given column, combining one predicate per
 * schema path with AND.
 *
 * @param colName        the Gaffer column name being filtered
 * @param parquetObjects the comparison values, one per path returned for {@code colName}
 * @param group          the element group the column belongs to
 * @param schemaUtils    used to resolve the Parquet paths backing the column
 * @return the combined predicate, or {@code null} if any value has a type that is not
 *         natively supported (in which case {@code fullyApplied} is set to {@code false}
 *         so the filter is applied post-read instead)
 */
private FilterPredicate getIsMoreThanOrEqualToFilter(final String colName, final Object[] parquetObjects, final String group, final SchemaUtils schemaUtils) {
String[] paths = schemaUtils.getPaths(group, colName);
if (null == paths) {
// No schema mapping for this column: fall back to using the column name as the path.
paths = new String[]{colName};
}
FilterPredicate filter = null;
for (int i = 0; i < paths.length; i++) {
final String path = paths[i];
final Object parquetObject = parquetObjects[i];
final FilterPredicate tempFilter;
if (parquetObject instanceof String) {
tempFilter = gtEq(binaryColumn(path), Binary.fromString((String) parquetObject));
} else if (parquetObject instanceof Double) {
tempFilter = gtEq(doubleColumn(path), (Double) parquetObject);
} else if (parquetObject instanceof Float) {
tempFilter = gtEq(floatColumn(path), (Float) parquetObject);
} else if (parquetObject instanceof Integer) {
tempFilter = gtEq(intColumn(path), (Integer) parquetObject);
} else if (parquetObject instanceof Long) {
tempFilter = gtEq(longColumn(path), (Long) parquetObject);
} else if (parquetObject instanceof java.util.Date) {
// Also covers java.sql.Date, which is a subclass of java.util.Date; both are
// compared as epoch milliseconds. (A separate java.sql.Date branch would be
// unreachable after this one.)
tempFilter = gtEq(longColumn(path), ((java.util.Date) parquetObject).getTime());
} else if (parquetObject instanceof Short) {
// Shorts are widened to int, matching how they are stored in Parquet.
tempFilter = gtEq(intColumn(path), ((Short) parquetObject).intValue());
} else if (parquetObject instanceof byte[]) {
tempFilter = gtEq(binaryColumn(path), Binary.fromReusedByteArray((byte[]) parquetObject));
} else {
// Unsupported (or null) value type: signal that the filter must be applied post-read.
fullyApplied = false;
final String typeName = null == parquetObject ? "null" : parquetObject.getClass().getCanonicalName();
LOGGER.warn("{} is not a natively supported type for the IsMoreThanOrEqualTo filter, therefore execution will take longer to perform this filter.", typeName);
return null;
}
// AND the per-path predicates together.
filter = null == filter ? tempFilter : and(filter, tempFilter);
}
return filter;
}
Use of org.apache.parquet.filter2.predicate.FilterPredicate in project beam by apache: class ParquetIOTest, method testWriteAndReadFilesAsJsonForUnknownSchemaWithConfiguration.
@Test
public void testWriteAndReadFilesAsJsonForUnknownSchemaWithConfiguration() {
// Write ten records, but expect only one back: the Parquet-level filter keeps id == "0".
final List<GenericRecord> written = generateGenericRecords(10);
final List<GenericRecord> expected = generateGenericRecords(1);

final String outputDir = temporaryFolder.getRoot().getAbsolutePath();
mainPipeline
.apply(Create.of(written).withCoder(AvroCoder.of(SCHEMA)))
.apply(FileIO.<GenericRecord>write().via(ParquetIO.sink(SCHEMA)).to(outputDir));
mainPipeline.run().waitUntilFinish();

// Push an id == "0" predicate down into the Parquet reader via the Hadoop configuration.
final Configuration hadoopConf = new Configuration();
final FilterPredicate idIsZero = FilterApi.eq(FilterApi.binaryColumn("id"), Binary.fromString("0"));
ParquetInputFormat.setFilterPredicate(hadoopConf, idIsZero);

final PCollection<String> readBackAsJson = readPipeline.apply(
ParquetIO.parseGenericRecords(ParseGenericRecordAsJsonFn.create())
.withConfiguration(hadoopConf)
.from(outputDir + "/*"));
PAssert.that(readBackAsJson).containsInAnyOrder(convertRecordsToJson(expected));
readPipeline.run().waitUntilFinish();
}
Use of org.apache.parquet.filter2.predicate.FilterPredicate in project beam by apache: class ParquetIOTest, method testWriteAndReadWithConfiguration.
@Test
public void testWriteAndReadWithConfiguration() {
// Write ten records, but expect only one back: the Parquet-level filter keeps id == "0".
final List<GenericRecord> written = generateGenericRecords(10);
final List<GenericRecord> expected = generateGenericRecords(1);

final String outputDir = temporaryFolder.getRoot().getAbsolutePath();
mainPipeline
.apply(Create.of(written).withCoder(AvroCoder.of(SCHEMA)))
.apply(FileIO.<GenericRecord>write().via(ParquetIO.sink(SCHEMA)).to(outputDir));
mainPipeline.run().waitUntilFinish();

// Push an id == "0" predicate down into the Parquet reader via the Hadoop configuration.
final Configuration hadoopConf = new Configuration();
final FilterPredicate idIsZero = FilterApi.eq(FilterApi.binaryColumn("id"), Binary.fromString("0"));
ParquetInputFormat.setFilterPredicate(hadoopConf, idIsZero);

final PCollection<GenericRecord> readBack = readPipeline.apply(
ParquetIO.read(SCHEMA)
.from(outputDir + "/*")
.withConfiguration(hadoopConf)
.withSplit());
PAssert.that(readBack).containsInAnyOrder(expected);
readPipeline.run().waitUntilFinish();
}
Aggregations