use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.
the class VectorizedParquetRecordReader method initialize.
public void initialize(ParquetInputSplit split, JobConf configuration) throws IOException, InterruptedException {
  jobConf = configuration;
  ParquetMetadata footer;
  List<BlockMetaData> blocks;
  boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
  this.file = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();
  String columnNames = configuration.get(IOConstants.COLUMNS);
  columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
  String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
  columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
  // if task.side.metadata is set, rowGroupOffsets is null
  if (rowGroupOffsets == null) {
    // TODO check whether rowGroupOffsets can be null
    // then we need to apply the predicate push down filter
    footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    FilterCompat.Filter filter = getFilter(configuration);
    blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
  } else {
    // otherwise we find the row groups that were selected on the client
    footer = readFooter(configuration, file, NO_FILTER);
    Set<Long> offsets = new HashSet<>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    blocks = new ArrayList<>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (offsets.contains(block.getStartingPos())) {
        blocks.add(block);
      }
    }
    // verify we found them all
    if (blocks.size() != rowGroupOffsets.length) {
      long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
      for (int i = 0; i < foundRowGroupOffsets.length; i++) {
        foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
      }
      // provide a good error message in case there's a bug
      throw new IllegalStateException("All the offsets listed in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks
          + " out of: " + Arrays.toString(foundRowGroupOffsets)
          + " in range " + split.getStart() + ", " + split.getEnd());
    }
  }
  for (BlockMetaData block : blocks) {
    this.totalRowCount += block.getRowCount();
  }
  this.fileSchema = footer.getFileMetaData().getSchema();
  MessageType tableSchema;
  if (indexAccess) {
    List<Integer> indexSequence = new ArrayList<>();
    // Generates a sequence list of indexes
    for (int i = 0; i < columnNamesList.size(); i++) {
      indexSequence.add(i);
    }
    tableSchema = DataWritableReadSupport.getSchemaByIndex(fileSchema, columnNamesList, indexSequence);
  } else {
    tableSchema = DataWritableReadSupport.getSchemaByName(fileSchema, columnNamesList, columnTypesList);
  }
  indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
  if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) {
    requestedSchema = DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, indexColumnsWanted);
  } else {
    requestedSchema = fileSchema;
  }
  this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
}
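For context, the FilterCompat.Filter returned by getFilter(configuration) above is typically produced by serializing a FilterPredicate into the job configuration on the client side. A minimal sketch of that setup, assuming a made-up column name "sampleCol" and threshold (the reader construction itself is omitted):

import static org.apache.parquet.filter2.predicate.FilterApi.gt;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;

import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class PushDownFilterSetup {
  public static void main(String[] args) {
    // Hypothetical predicate: sampleCol > 100.
    FilterPredicate pred = gt(intColumn("sampleCol"), 100);
    JobConf jobConf = new JobConf();
    // Serializes the predicate into the configuration; getFilter(configuration)
    // in initialize() above reads it back as a FilterPredicateCompat, and
    // filterRowGroups() can then drop row groups it proves irrelevant.
    ParquetInputFormat.setFilterPredicate(jobConf, pred);
  }
}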
use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.
the class FilterPredicateLeafBuilder method buildPredicate.
/**
 * Builds a filter predicate from multiple constants.
 *
 * @param op the multi-literal operator, IN or BETWEEN
 * @param literals the constant values to compare against
 * @param columnName the name of the column the predicate applies to
 * @return the combined FilterPredicate
 * @throws Exception if the predicate cannot be built
 */
public FilterPredicate buildPredicate(PredicateLeaf.Operator op, List<Object> literals, String columnName) throws Exception {
  FilterPredicate result = null;
  switch (op) {
    case IN:
      for (Object literal : literals) {
        if (result == null) {
          result = buildPredict(PredicateLeaf.Operator.EQUALS, literal, columnName);
        } else {
          result = or(result, buildPredict(PredicateLeaf.Operator.EQUALS, literal, columnName));
        }
      }
      return result;
    case BETWEEN:
      if (literals.size() != 2) {
        throw new RuntimeException("Not able to build 'between' operation filter with " + literals
            + " which needs two literals");
      }
      Object min = literals.get(0);
      Object max = literals.get(1);
      // column >= min, expressed as not(column < min)
      FilterPredicate lt = not(buildPredict(PredicateLeaf.Operator.LESS_THAN, min, columnName));
      // column <= max
      FilterPredicate gt = buildPredict(PredicateLeaf.Operator.LESS_THAN_EQUALS, max, columnName);
      result = FilterApi.and(gt, lt);
      return result;
    default:
      throw new RuntimeException("Unknown PredicateLeaf Operator type: " + op);
  }
}
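The predicates this method assembles are ordinary parquet-mr FilterApi expressions. A rough sketch of the equivalent shapes built directly with FilterApi, assuming a hypothetical int column "price" and skipping the Hive leaf-builder indirection:

import static org.apache.parquet.filter2.predicate.FilterApi.and;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.lt;
import static org.apache.parquet.filter2.predicate.FilterApi.ltEq;
import static org.apache.parquet.filter2.predicate.FilterApi.not;
import static org.apache.parquet.filter2.predicate.FilterApi.or;

import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;

public class LeafPredicateShapes {
  public static void main(String[] args) {
    IntColumn price = intColumn("price");  // hypothetical column
    // IN (3, 7): a chain of EQUALS joined with or(), as in the IN branch above.
    FilterPredicate in = or(eq(price, 3), eq(price, 7));
    // BETWEEN 3 AND 7: price <= 7 and not(price < 3), as in the BETWEEN branch above.
    FilterPredicate between = and(ltEq(price, 7), not(lt(price, 3)));
    System.out.println(in);
    System.out.println(between);
  }
}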
use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.
the class VectorizedParquetRecordReader method initialize.
@SuppressWarnings("deprecation")
public void initialize(InputSplit oldSplit, JobConf configuration) throws IOException, InterruptedException {
  // the oldSplit may be null during the split phase
  if (oldSplit == null) {
    return;
  }
  ParquetMetadata footer;
  List<BlockMetaData> blocks;
  ParquetInputSplit split = (ParquetInputSplit) oldSplit;
  boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
  this.file = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();
  String columnNames = configuration.get(IOConstants.COLUMNS);
  columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
  String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
  columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
  // if task.side.metadata is set, rowGroupOffsets is null
  Object cacheKey = null;
  String cacheTag = null;
  // TODO: also support fileKey in splits, like OrcSplit does
  if (metadataCache != null) {
    cacheKey = HdfsUtils.getFileId(file.getFileSystem(configuration), file,
        HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID),
        HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID));
  }
  if (cacheKey != null) {
    if (HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_TRACK_CACHE_USAGE)) {
      cacheTag = LlapUtil.getDbAndTableNameForMetrics(file, true);
    }
    // If we are going to use cache, change the path to depend on file ID for extra consistency.
    FileSystem fs = file.getFileSystem(configuration);
    if (cacheKey instanceof Long && HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
      file = HdfsUtils.getFileIdPath(fs, file, (long) cacheKey);
    }
  }
  if (rowGroupOffsets == null) {
    // TODO check whether rowGroupOffsets can be null
    // then we need to apply the predicate push down filter
    footer = readSplitFooter(configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag);
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    FilterCompat.Filter filter = getFilter(configuration);
    blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
  } else {
    // otherwise we find the row groups that were selected on the client
    footer = readSplitFooter(configuration, file, cacheKey, NO_FILTER, cacheTag);
    Set<Long> offsets = new HashSet<>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    blocks = new ArrayList<>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (offsets.contains(block.getStartingPos())) {
        blocks.add(block);
      }
    }
    // verify we found them all
    if (blocks.size() != rowGroupOffsets.length) {
      long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
      for (int i = 0; i < foundRowGroupOffsets.length; i++) {
        foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
      }
      // provide a good error message in case there's a bug
      throw new IllegalStateException("All the offsets listed in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks
          + " out of: " + Arrays.toString(foundRowGroupOffsets)
          + " in range " + split.getStart() + ", " + split.getEnd());
    }
  }
  for (BlockMetaData block : blocks) {
    this.totalRowCount += block.getRowCount();
  }
  this.fileSchema = footer.getFileMetaData().getSchema();
  colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
  requestedSchema = DataWritableReadSupport.getRequestedSchema(indexAccess, columnNamesList, columnTypesList, fileSchema, configuration);
  Path path = wrapPathForCache(file, cacheKey, configuration, blocks, cacheTag);
  this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
}
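The branch on rowGroupOffsets above depends on whether row groups were already selected on the client or left to the task. In parquet-mr this is governed by the task-side-metadata setting; a small illustrative sketch of toggling it, independent of the LLAP cache plumbing:

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class TaskSideMetadataToggle {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // With task-side metadata enabled, splits carry no row-group offsets and the
    // reader applies the pushed-down filter itself (the rowGroupOffsets == null branch).
    ParquetInputFormat.setTaskSideMetaData(conf, true);
    System.out.println(ParquetInputFormat.isTaskSideMetaData(conf));
  }
}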
use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project parquet-mr by apache.
the class RowGroupFilter method visit.
@Override
public List<BlockMetaData> visit(FilterCompat.FilterPredicateCompat filterPredicateCompat) {
  FilterPredicate filterPredicate = filterPredicateCompat.getFilterPredicate();
  // check that the schema of the filter matches the schema of the file
  SchemaCompatibilityValidator.validate(filterPredicate, schema);
  List<BlockMetaData> filteredBlocks = new ArrayList<BlockMetaData>();
  for (BlockMetaData block : blocks) {
    boolean drop = false;
    if (levels.contains(FilterLevel.STATISTICS)) {
      drop = StatisticsFilter.canDrop(filterPredicate, block.getColumns());
    }
    if (!drop && levels.contains(FilterLevel.DICTIONARY)) {
      drop = DictionaryFilter.canDrop(filterPredicate, block.getColumns(), reader.getDictionaryReader(block));
    }
    if (!drop) {
      filteredBlocks.add(block);
    }
  }
  return filteredBlocks;
}
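Callers usually reach this visit method through the static RowGroupFilter.filterRowGroups helper, passing a FilterCompat.Filter built with FilterCompat.get. A hedged sketch under assumed inputs (a parsed one-column schema and an empty block list standing in for footer.getBlocks()):

import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;

import java.util.ArrayList;
import java.util.List;

import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.compat.RowGroupFilter;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class RowGroupFilterSketch {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType("message doc { required int32 x; }");
    List<BlockMetaData> blocks = new ArrayList<>();  // would come from footer.getBlocks()
    FilterCompat.Filter filter = FilterCompat.get(eq(intColumn("x"), 7));
    // Dispatches to the visit() method above (statistics level only for this legacy helper).
    List<BlockMetaData> kept = RowGroupFilter.filterRowGroups(filter, blocks, schema);
    System.out.println(kept.size());
  }
}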
use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project parquet-mr by apache.
the class TestInputFormat method testGetFilter.
@Test
public void testGetFilter() throws IOException {
  IntColumn intColumn = intColumn("foo");
  FilterPredicate p = or(eq(intColumn, 7), eq(intColumn, 12));
  Configuration conf = new Configuration();
  ParquetInputFormat.setFilterPredicate(conf, p);
  Filter read = ParquetInputFormat.getFilter(conf);
  assertTrue(read instanceof FilterPredicateCompat);
  assertEquals(p, ((FilterPredicateCompat) read).getFilterPredicate());
  conf = new Configuration();
  ParquetInputFormat.setFilterPredicate(conf, not(p));
  read = ParquetInputFormat.getFilter(conf);
  assertTrue(read instanceof FilterPredicateCompat);
  assertEquals(and(notEq(intColumn, 7), notEq(intColumn, 12)), ((FilterPredicateCompat) read).getFilterPredicate());
  assertEquals(FilterCompat.NOOP, ParquetInputFormat.getFilter(new Configuration()));
}
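The second assertion passes because the not() wrapper is collapsed into the leaves before the filter is returned; parquet-mr exposes the same rewrite as LogicalInverseRewriter. A small illustration using the same literals as the test:

import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.not;
import static org.apache.parquet.filter2.predicate.FilterApi.or;

import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.LogicalInverseRewriter;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;

public class NotRewriteDemo {
  public static void main(String[] args) {
    IntColumn foo = intColumn("foo");
    FilterPredicate p = or(eq(foo, 7), eq(foo, 12));
    // De Morgan: not(a or b) becomes and(not a, not b), with not(eq) collapsed to notEq.
    FilterPredicate rewritten = LogicalInverseRewriter.rewrite(not(p));
    System.out.println(rewritten);  // expected shape: and(noteq(foo, 7), noteq(foo, 12))
  }
}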