Use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project parquet-mr by apache.
The class ClientSideMetadataSplitStrategy, method getSplits:
List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers,
    long maxSplitSize, long minSplitSize, ReadContext readContext) throws IOException {
  List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
  Filter filter = ParquetInputFormat.getFilter(configuration);
  long rowGroupsDropped = 0;
  long totalRowGroups = 0;
  for (Footer footer : footers) {
    final Path file = footer.getFile();
    LOG.debug("{}", file);
    FileSystem fs = file.getFileSystem(configuration);
    FileStatus fileStatus = fs.getFileStatus(file);
    ParquetMetadata parquetMetaData = footer.getParquetMetadata();
    List<BlockMetaData> blocks = parquetMetaData.getBlocks();
    totalRowGroups += blocks.size();
    List<BlockMetaData> filteredBlocks = RowGroupFilter.filterRowGroups(
        filter, blocks, parquetMetaData.getFileMetaData().getSchema());
    rowGroupsDropped += blocks.size() - filteredBlocks.size();
    if (filteredBlocks.isEmpty()) {
      continue;
    }
    BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
    splits.addAll(generateSplits(filteredBlocks, fileBlockLocations, fileStatus,
        readContext.getRequestedSchema().toString(), readContext.getReadSupportMetadata(),
        minSplitSize, maxSplitSize));
  }
  if (rowGroupsDropped > 0 && totalRowGroups > 0) {
    int percentDropped = (int) ((((double) rowGroupsDropped) / totalRowGroups) * 100);
    LOG.info("Dropping {} row groups that do not pass filter predicate! ({}%)", rowGroupsDropped, percentDropped);
  } else {
    LOG.info("There were no row groups that could be dropped due to filter predicates");
  }
  return splits;
}
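The Filter consumed here is produced from a FilterPredicate that was serialized into the job configuration earlier. A minimal sketch of that round trip, assuming a hypothetical int32 column named "a" (the column name and threshold are illustrative, not from the snippet above):

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.filter2.compat.FilterCompat.Filter;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class FilterRoundTrip {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Keep rows where column "a" is greater than 10 (hypothetical column).
    FilterPredicate p = FilterApi.gt(FilterApi.intColumn("a"), 10);
    // Serialize the predicate into the job configuration ...
    ParquetInputFormat.setFilterPredicate(conf, p);
    // ... and read it back as a FilterCompat.Filter, as getSplits does above.
    Filter filter = ParquetInputFormat.getFilter(conf);
    System.out.println(filter);
  }
}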
Use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.
The class TestParquetFilterPredicate, method testFilterColumnsThatDoNoExistOnSchema:
@Test
public void testFilterColumnsThatDoNoExistOnSchema() {
  MessageType schema = MessageTypeParser.parseMessageType(
      "message test { required int32 a; required binary stinger; }");
  SearchArgument sarg = SearchArgumentFactory.newBuilder()
      .startNot()
      .startOr()
      .isNull("a", PredicateLeaf.Type.LONG)
      .between("y", PredicateLeaf.Type.LONG, 10L, 20L) // Column will be removed from filter
      .in("z", PredicateLeaf.Type.LONG, 1L, 2L, 3L) // Column will be removed from filter
      .nullSafeEquals("a", PredicateLeaf.Type.STRING, "stinger")
      .end()
      .end()
      .build();
  FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema);
  String expected = "and(not(eq(a, null)), not(eq(a, Binary{\"stinger\"})))";
  assertEquals(expected, p.toString());
}
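The surviving clauses can also be built directly with parquet-mr's FilterApi, which shows what the converter keeps after dropping the y and z columns. A minimal sketch; the column bindings are chosen for illustration and are not taken from the converter's internals:

import static org.apache.parquet.filter2.predicate.FilterApi.*;

import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.io.api.Binary;

public class ExpectedPredicate {
  public static void main(String[] args) {
    // not(isNull(a)) and not(a = "stinger"); y and z are gone because
    // they are absent from the schema.
    FilterPredicate p = and(
        not(eq(longColumn("a"), null)),
        not(eq(binaryColumn("a"), Binary.fromString("stinger"))));
    // Should print the same string the test asserts.
    System.out.println(p);
  }
}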
Use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.
The class ParquetRecordReaderBase, method setFilter:
public FilterCompat.Filter setFilter(final JobConf conf, MessageType schema) {
  SearchArgument sarg = ConvertAstToSearchArg.createFromConf(conf);
  if (sarg == null) {
    return null;
  }
  // Create the Parquet FilterPredicate without including columns that do not exist
  // on the schema (such as partition columns).
  FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema);
  if (p != null) {
    // Filter may have sensitive information. Do not send to debug.
    LOG.debug("PARQUET predicate push down generated.");
    ParquetInputFormat.setFilterPredicate(conf, p);
    return FilterCompat.get(p);
  } else {
    // Filter may have sensitive information. Do not send to debug.
    LOG.debug("No PARQUET predicate push down is generated.");
    return null;
  }
}
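Code that receives the returned FilterCompat.Filter can unwrap it through the visitor that FilterCompat defines, which is how parquet-mr readers distinguish a real predicate from the no-op and legacy cases. A minimal sketch of that pattern:

import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.compat.FilterCompat.FilterPredicateCompat;
import org.apache.parquet.filter2.compat.FilterCompat.NoOpFilter;
import org.apache.parquet.filter2.compat.FilterCompat.UnboundRecordFilterCompat;
import org.apache.parquet.filter2.predicate.FilterPredicate;

public class FilterUnwrap {
  // Returns the wrapped FilterPredicate, or null when there is none.
  static FilterPredicate unwrap(FilterCompat.Filter filter) {
    return filter.accept(new FilterCompat.Visitor<FilterPredicate>() {
      @Override
      public FilterPredicate visit(FilterPredicateCompat f) {
        return f.getFilterPredicate();
      }

      @Override
      public FilterPredicate visit(UnboundRecordFilterCompat f) {
        return null; // legacy record-level filter: no predicate to extract
      }

      @Override
      public FilterPredicate visit(NoOpFilter f) {
        return null; // FilterCompat.NOOP: nothing to apply
      }
    });
  }
}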
Use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.
The class ParquetRecordReaderBase, method getSplit:
/**
 * Gets a ParquetInputSplit corresponding to a split given by Hive.
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
@SuppressWarnings("deprecation")
protected ParquetInputSplit getSplit(final org.apache.hadoop.mapred.InputSplit oldSplit, final JobConf conf) throws IOException {
  ParquetInputSplit split;
  if (oldSplit instanceof FileSplit) {
    final Path finalPath = ((FileSplit) oldSplit).getPath();
    jobConf = projectionPusher.pushProjectionsAndFilters(conf, finalPath.getParent());
    // TODO enable MetadataFilter by using readFooter(Configuration configuration, Path file,
    // MetadataFilter filter) API
    final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(jobConf, finalPath);
    final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
    final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    final ReadSupport.ReadContext readContext =
        new DataWritableReadSupport().init(new InitContext(jobConf, null, fileMetaData.getSchema()));
    // Compute stats
    for (BlockMetaData bmd : blocks) {
      serDeStats.setRowCount(serDeStats.getRowCount() + bmd.getRowCount());
      serDeStats.setRawDataSize(serDeStats.getRawDataSize() + bmd.getTotalByteSize());
    }
    schemaSize = MessageTypeParser.parseMessageType(
        readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();
    final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
    final long splitStart = ((FileSplit) oldSplit).getStart();
    final long splitLength = ((FileSplit) oldSplit).getLength();
    for (final BlockMetaData block : blocks) {
      final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
      if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
        splitGroup.add(block);
      }
    }
    if (splitGroup.isEmpty()) {
      LOG.warn("Skipping split, could not find row group in: " + oldSplit);
      return null;
    }
    FilterCompat.Filter filter = setFilter(jobConf, fileMetaData.getSchema());
    if (filter != null) {
      filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
      if (filtedBlocks.isEmpty()) {
        LOG.debug("All row groups are dropped due to filter predicates");
        return null;
      }
      long droppedBlocks = splitGroup.size() - filtedBlocks.size();
      if (droppedBlocks > 0) {
        LOG.debug("Dropping " + droppedBlocks + " row groups that do not pass filter predicate");
      }
    } else {
      filtedBlocks = splitGroup;
    }
    if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION)) {
      skipTimestampConversion = !Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr");
    }
    split = new ParquetInputSplit(finalPath, splitStart, splitLength, oldSplit.getLocations(), filtedBlocks,
        readContext.getRequestedSchema().toString(), fileMetaData.getSchema().toString(),
        fileMetaData.getKeyValueMetaData(), readContext.getReadSupportMetadata());
    return split;
  } else {
    throw new IllegalArgumentException("Unknown split type: " + oldSplit);
  }
}
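The TODO above refers to the readFooter overload that takes a MetadataFilter, which reads only the footer metadata for row groups whose offsets fall inside a byte range rather than the whole footer. A minimal sketch of that call, reusing the variable names from the method above:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class RangeFooterRead {
  // Reads metadata only for row groups inside [splitStart, splitStart + splitLength).
  static ParquetMetadata readFooterForSplit(
      Configuration conf, Path finalPath, long splitStart, long splitLength) throws IOException {
    return ParquetFileReader.readFooter(
        conf, finalPath, ParquetMetadataConverter.range(splitStart, splitStart + splitLength));
  }
}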
Use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project parquet-mr by apache.
The class ParquetLoader, method buildFilter:
private FilterPredicate buildFilter(Expression e) {
  OpType op = e.getOpType();
  if (e instanceof BinaryExpression) {
    Expression lhs = ((BinaryExpression) e).getLhs();
    Expression rhs = ((BinaryExpression) e).getRhs();
    switch (op) {
      case OP_AND:
        return and(buildFilter(lhs), buildFilter(rhs));
      case OP_OR:
        return or(buildFilter(lhs), buildFilter(rhs));
      case OP_BETWEEN:
        BetweenExpression between = (BetweenExpression) rhs;
        return and(
            buildFilter(OpType.OP_GE, (Column) lhs, (Const) between.getLower()),
            buildFilter(OpType.OP_LE, (Column) lhs, (Const) between.getUpper()));
      case OP_IN:
        FilterPredicate current = null;
        for (Object value : ((InExpression) rhs).getValues()) {
          FilterPredicate next = buildFilter(OpType.OP_EQ, (Column) lhs, (Const) value);
          if (current != null) {
            current = or(current, next);
          } else {
            current = next;
          }
        }
        return current;
    }
    if (lhs instanceof Column && rhs instanceof Const) {
      return buildFilter(op, (Column) lhs, (Const) rhs);
    } else if (lhs instanceof Const && rhs instanceof Column) {
      return buildFilter(op, (Column) rhs, (Const) lhs);
    }
  } else if (e instanceof UnaryExpression && op == OpType.OP_NOT) {
    return LogicalInverseRewriter.rewrite(not(buildFilter(((UnaryExpression) e).getExpression())));
  }
  throw new RuntimeException("Could not build filter for expression: " + e);
}