use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.
the class FilterPredicateLeafBuilder method buildPredicate.
/**
 * Build a filter predicate from multiple constants.
 *
 * @param op IN or BETWEEN
 * @param literals the constant values to compare against
 * @param columnName the name of the column being filtered
 * @param columnType the Hive type of the column
 * @return the combined FilterPredicate
 */
public FilterPredicate buildPredicate(PredicateLeaf.Operator op, List<Object> literals, String columnName, TypeInfo columnType) throws Exception {
  FilterPredicate result = null;
  switch (op) {
    case IN:
      // IN is expanded into a chain of OR'ed equality predicates, one per literal.
      for (Object literal : literals) {
        if (result == null) {
          result = buildPredict(PredicateLeaf.Operator.EQUALS, literal, columnName, columnType);
        } else {
          result = or(result, buildPredict(PredicateLeaf.Operator.EQUALS, literal, columnName, columnType));
        }
      }
      return result;
    case BETWEEN:
      if (literals.size() != 2) {
        throw new RuntimeException("Not able to build 'between' operation filter with " + literals +
            " which needs two literals");
      }
      Object min = literals.get(0);
      Object max = literals.get(1);
      // BETWEEN is decomposed into (value >= min) AND (value <= max):
      // NOT(value < min) provides the lower bound, (value <= max) the upper bound.
      FilterPredicate lt = not(buildPredict(PredicateLeaf.Operator.LESS_THAN, min, columnName, columnType));
      FilterPredicate gt = buildPredict(PredicateLeaf.Operator.LESS_THAN_EQUALS, max, columnName, columnType);
      result = FilterApi.and(gt, lt);
      return result;
    default:
      throw new RuntimeException("Unknown PredicateLeaf Operator type: " + op);
  }
}
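To see the IN expansion concretely, here is a small standalone sketch, not taken from the Hive source; the column name `x` and the literal values are invented for illustration. It builds the same left-folded OR-chain the IN branch above produces, using the Parquet FilterApi directly:

import java.util.Arrays;
import java.util.List;

import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;

public class InExpansionSketch {
  public static void main(String[] args) {
    IntColumn x = FilterApi.intColumn("x");
    List<Integer> literals = Arrays.asList(1, 2, 3);

    // x IN (1, 2, 3) becomes eq(x,1) OR eq(x,2) OR eq(x,3),
    // folded left-to-right exactly like the loop in buildPredicate.
    FilterPredicate result = null;
    for (Integer literal : literals) {
      FilterPredicate eq = FilterApi.eq(x, literal);
      result = (result == null) ? eq : FilterApi.or(result, eq);
    }
    System.out.println(result); // or(or(eq(x, 1), eq(x, 2)), eq(x, 3))
  }
}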
use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.
the class ParquetRecordReaderBase method getSplit.
/**
* gets a ParquetInputSplit corresponding to a split given by Hive
*
* @param oldSplit The split given by Hive
* @param conf The JobConf of the Hive job
* @return a ParquetInputSplit corresponding to the oldSplit
* @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
*/
@SuppressWarnings("deprecation")
protected ParquetInputSplit getSplit(final org.apache.hadoop.mapred.InputSplit oldSplit, final JobConf conf) throws IOException {
if (oldSplit.getLength() == 0) {
return null;
}
ParquetInputSplit split;
if (oldSplit instanceof FileSplit) {
final Path finalPath = ((FileSplit) oldSplit).getPath();
jobConf = projectionPusher.pushProjectionsAndFilters(conf, finalPath.getParent());
// TODO enable MetadataFilter by using readFooter(Configuration configuration, Path file,
// MetadataFilter filter) API
final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(jobConf, finalPath);
final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
final ReadSupport.ReadContext readContext = new DataWritableReadSupport().init(new InitContext(jobConf, null, fileMetaData.getSchema()));
// Compute stats
for (BlockMetaData bmd : blocks) {
serDeStats.setRowCount(serDeStats.getRowCount() + bmd.getRowCount());
serDeStats.setRawDataSize(serDeStats.getRawDataSize() + bmd.getTotalByteSize());
}
schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();
final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
final long splitStart = ((FileSplit) oldSplit).getStart();
final long splitLength = ((FileSplit) oldSplit).getLength();
for (final BlockMetaData block : blocks) {
final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
splitGroup.add(block);
}
}
if (splitGroup.isEmpty()) {
LOG.warn("Skipping split, could not find row group in: " + oldSplit);
return null;
}
FilterCompat.Filter filter = setFilter(jobConf, fileMetaData.getSchema());
if (filter != null) {
filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
if (filtedBlocks.isEmpty()) {
LOG.debug("All row groups are dropped due to filter predicates");
return null;
}
long droppedBlocks = splitGroup.size() - filtedBlocks.size();
if (droppedBlocks > 0) {
LOG.debug("Dropping " + droppedBlocks + " row groups that do not pass filter predicate");
}
} else {
filtedBlocks = splitGroup;
}
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION)) {
skipTimestampConversion = !Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr");
}
skipProlepticConversion = DataWritableReadSupport.getWriterDateProleptic(fileMetaData.getKeyValueMetaData());
if (skipProlepticConversion == null) {
skipProlepticConversion = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PARQUET_DATE_PROLEPTIC_GREGORIAN_DEFAULT);
}
legacyConversionEnabled = HiveConf.getBoolVar(conf, ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED);
if (fileMetaData.getKeyValueMetaData().containsKey(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY)) {
legacyConversionEnabled = Boolean.parseBoolean(fileMetaData.getKeyValueMetaData().get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY));
}
split = new ParquetInputSplit(finalPath, splitStart, splitLength, oldSplit.getLocations(), filtedBlocks, readContext.getRequestedSchema().toString(), fileMetaData.getSchema().toString(), fileMetaData.getKeyValueMetaData(), readContext.getReadSupportMetadata());
return split;
} else {
throw new IllegalArgumentException("Unknown split type: " + oldSplit);
}
}
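The footer read above uses the deprecated readFooter(Configuration, Path) overload. A minimal standalone sketch of pulling the same row-group metadata that getSplit iterates over for its stats; the file path and variable names here are illustrative, not from the source:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterStatsSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path file = new Path("/tmp/example.parquet"); // illustrative path

    @SuppressWarnings("deprecation")
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file);

    // Accumulate row counts and byte sizes across row groups,
    // the same totals getSplit feeds into serDeStats.
    long rows = 0;
    long bytes = 0;
    for (BlockMetaData block : footer.getBlocks()) {
      rows += block.getRowCount();
      bytes += block.getTotalByteSize();
    }
    System.out.println(rows + " rows in " + footer.getBlocks().size()
        + " row groups, " + bytes + " bytes");
  }
}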
use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.
the class ParquetRecordReaderBase method setFilter.
public FilterCompat.Filter setFilter(final JobConf conf, MessageType schema) {
  SearchArgument sarg = ConvertAstToSearchArg.createFromConf(conf);
  if (sarg == null) {
    return null;
  }
  String columnTypes = conf.get(IOConstants.COLUMNS_TYPES);
  String columnNames = conf.get(IOConstants.COLUMNS);
  List<TypeInfo> columnTypeList = TypeInfoUtils.getTypeInfosFromTypeString(columnTypes);
  Map<String, TypeInfo> columns = new HashMap<>();
  String[] names = columnNames.split(",");
  for (int i = 0; i < names.length; i++) {
    columns.put(names[i], columnTypeList.get(i));
  }
  // Create the Parquet FilterPredicate without including columns that do not exist
  // on the schema (such as partition columns).
  FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema, columns);
  if (p != null) {
    // Filter may have sensitive information. Do not send to debug.
    LOG.debug("PARQUET predicate push down generated.");
    ParquetInputFormat.setFilterPredicate(conf, p);
    return FilterCompat.get(p);
  } else {
    // Filter may have sensitive information. Do not send to debug.
    LOG.debug("No PARQUET predicate push down is generated.");
    return null;
  }
}
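Note that setFilter both registers the predicate on the JobConf and returns the FilterCompat wrapper. A hedged sketch of that handoff in isolation; the column name `a` and the threshold are invented stand-ins for the converter's output:

import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class FilterHandoffSketch {
  public static void main(String[] args) {
    JobConf conf = new JobConf();

    // An arbitrary predicate standing in for the converter's result: a > 10.
    FilterPredicate p = FilterApi.gt(FilterApi.intColumn("a"), 10);

    // The same two steps as the tail of setFilter: publish the predicate on
    // the conf for downstream readers, then wrap it for the filtering API.
    ParquetInputFormat.setFilterPredicate(conf, p);
    FilterCompat.Filter filter = FilterCompat.get(p);

    System.out.println(filter != null ? "filter ready" : "no filter");
  }
}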
use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.
the class TestParquetFilterPredicate method testFilterColumnsThatDoNoExistOnSchema.
@Test
public void testFilterColumnsThatDoNoExistOnSchema() {
  MessageType schema = MessageTypeParser.parseMessageType(
      "message test { required int32 a; required binary stinger; }");
  SearchArgument sarg = SearchArgumentFactory.newBuilder()
      .startNot()
      .startOr()
      .isNull("a", PredicateLeaf.Type.LONG)
      .between("y", PredicateLeaf.Type.LONG, 10L, 20L) // Column will be removed from filter
      .in("z", PredicateLeaf.Type.LONG, 1L, 2L, 3L) // Column will be removed from filter
      .nullSafeEquals("stinger", PredicateLeaf.Type.STRING, "stinger")
      .end()
      .end()
      .build();
  Map<String, TypeInfo> columnTypes = new HashMap<>();
  columnTypes.put("a", TypeInfoFactory.getPrimitiveTypeInfo("int"));
  columnTypes.put("y", TypeInfoFactory.getPrimitiveTypeInfo("int"));
  columnTypes.put("z", TypeInfoFactory.getPrimitiveTypeInfo("int"));
  columnTypes.put("stinger", TypeInfoFactory.getPrimitiveTypeInfo("string"));
  FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema, columnTypes);
  String expected = "and(not(eq(a, null)), not(eq(stinger, Binary{\"stinger\"})))";
  assertEquals(expected, p.toString());
}
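Since setFilter above guards against a null conversion result, a natural companion check is that the converter yields null when every referenced column is absent from the file schema. This is my reading of the converter's contract inferred from that null check, not a test from the source; a hedged sketch:

@Test
public void testAllFilterColumnsMissingFromSchema() {
  // Hypothetical companion test: the schema contains none of the predicate columns.
  MessageType schema = MessageTypeParser.parseMessageType(
      "message test { required int32 a; }");
  SearchArgument sarg = SearchArgumentFactory.newBuilder()
      .startAnd()
      .between("y", PredicateLeaf.Type.LONG, 10L, 20L)
      .end()
      .build();
  Map<String, TypeInfo> columnTypes = new HashMap<>();
  columnTypes.put("y", TypeInfoFactory.getPrimitiveTypeInfo("int"));
  FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema, columnTypes);
  // Assumed contract, mirroring the null check in setFilter.
  assertNull(p);
}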
use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project Gaffer by gchq.
the class QueryGenerator method getIsEqualFilter.
private FilterPredicate getIsEqualFilter(final String colName, final Object[] parquetObjects, final String group) {
  String[] paths = schemaUtils.getPaths(group, colName);
  if (null == paths) {
    paths = new String[1];
    paths[0] = colName;
  }
  FilterPredicate filter = null;
  for (int i = 0; i < paths.length; i++) {
    final String path = paths[i];
    FilterPredicate tempFilter;
    // Dispatch on the Java type of the value to build the matching typed Parquet column equality.
    if (parquetObjects[i] instanceof String) {
      tempFilter = eq(binaryColumn(path), Binary.fromString((String) parquetObjects[i]));
    } else if (parquetObjects[i] instanceof Boolean) {
      tempFilter = eq(booleanColumn(path), (Boolean) parquetObjects[i]);
    } else if (parquetObjects[i] instanceof Double) {
      tempFilter = eq(doubleColumn(path), (Double) parquetObjects[i]);
    } else if (parquetObjects[i] instanceof Float) {
      tempFilter = eq(floatColumn(path), (Float) parquetObjects[i]);
    } else if (parquetObjects[i] instanceof Integer) {
      tempFilter = eq(intColumn(path), (Integer) parquetObjects[i]);
    } else if (parquetObjects[i] instanceof Long) {
      tempFilter = eq(longColumn(path), (Long) parquetObjects[i]);
    } else if (parquetObjects[i] instanceof java.util.Date) {
      tempFilter = eq(longColumn(path), ((java.util.Date) parquetObjects[i]).getTime());
    } else if (parquetObjects[i] instanceof java.sql.Date) {
      tempFilter = eq(longColumn(path), ((java.sql.Date) parquetObjects[i]).getTime());
    } else if (parquetObjects[i] instanceof Short) {
      tempFilter = eq(intColumn(path), ((Short) parquetObjects[i]).intValue());
    } else if (parquetObjects[i] instanceof byte[]) {
      tempFilter = eq(binaryColumn(path), Binary.fromReusedByteArray((byte[]) parquetObjects[i]));
    } else {
      LOGGER.warn(parquetObjects[i].getClass().getCanonicalName()
          + " is not a natively supported type for the IsEqual filter, therefore execution will take longer to perform this filter.");
      return null;
    }
    if (null == filter) {
      filter = tempFilter;
    } else {
      filter = and(filter, tempFilter);
    }
  }
  return filter;
}
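To make the type dispatch concrete, here is a standalone sketch of the AND-of-typed-equalities shape the loop produces for a column that maps to two paths; the paths `vertex.first` and `vertex.second` and the values are invented for illustration:

import static org.apache.parquet.filter2.predicate.FilterApi.and;
import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;

import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.io.api.Binary;

public class IsEqualSketch {
  public static void main(String[] args) {
    // A String value maps to a binary column and a Long to a long column,
    // mirroring two branches of the instanceof chain above.
    FilterPredicate first = eq(binaryColumn("vertex.first"), Binary.fromString("A"));
    FilterPredicate second = eq(longColumn("vertex.second"), 42L);

    // Multi-path columns are folded into a single conjunction.
    FilterPredicate filter = and(first, second);
    System.out.println(filter);
  }
}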