Search in sources:

Example 1 with Filter

use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.

Class VectorizedParquetRecordReader, method initialize.

public void initialize(ParquetInputSplit split, JobConf configuration) throws IOException, InterruptedException {
    jobConf = configuration;
    ParquetMetadata footer;
    List<BlockMetaData> blocks;
    boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
    this.file = split.getPath();
    long[] rowGroupOffsets = split.getRowGroupOffsets();
    String columnNames = configuration.get(IOConstants.COLUMNS);
    columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
    columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
    // if task.side.metadata is set, rowGroupOffsets is null
    if (rowGroupOffsets == null) {
        // TODO: check whether rowGroupOffsets can be null
        // then we need to apply the predicate pushdown filter
        footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
        MessageType fileSchema = footer.getFileMetaData().getSchema();
        FilterCompat.Filter filter = getFilter(configuration);
        blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
    } else {
        // otherwise we find the row groups that were selected on the client
        footer = readFooter(configuration, file, NO_FILTER);
        Set<Long> offsets = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            offsets.add(offset);
        }
        blocks = new ArrayList<>();
        for (BlockMetaData block : footer.getBlocks()) {
            if (offsets.contains(block.getStartingPos())) {
                blocks.add(block);
            }
        }
        // verify we found them all
        if (blocks.size() != rowGroupOffsets.length) {
            long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
            for (int i = 0; i < foundRowGroupOffsets.length; i++) {
                foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
            }
            // provide a good error message in case there's a bug
            throw new IllegalStateException("All the offsets listed in the split should be found in the file." + " expected: " + Arrays.toString(rowGroupOffsets) + " found: " + blocks + " out of: " + Arrays.toString(foundRowGroupOffsets) + " in range " + split.getStart() + ", " + split.getEnd());
        }
    }
    for (BlockMetaData block : blocks) {
        this.totalRowCount += block.getRowCount();
    }
    this.fileSchema = footer.getFileMetaData().getSchema();
    MessageType tableSchema;
    if (indexAccess) {
        List<Integer> indexSequence = new ArrayList<>();
        // Generates a sequence list of indexes
        for (int i = 0; i < columnNamesList.size(); i++) {
            indexSequence.add(i);
        }
        tableSchema = DataWritableReadSupport.getSchemaByIndex(fileSchema, columnNamesList, indexSequence);
    } else {
        tableSchema = DataWritableReadSupport.getSchemaByName(fileSchema, columnNamesList, columnTypesList);
    }
    indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
    if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) {
        requestedSchema = DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, indexColumnsWanted);
    } else {
        requestedSchema = fileSchema;
    }
    this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), FilterCompat (org.apache.parquet.filter2.compat.FilterCompat), ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), ArrayList (java.util.ArrayList), MessageType (org.apache.parquet.schema.MessageType), HashSet (java.util.HashSet)
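
The filter obtained from getFilter(configuration) above is the one a client serialized into the job configuration beforehand, typically via ParquetInputFormat.setFilterPredicate (the round trip is exercised in Example 5 below). A minimal sketch of that wiring, assuming a hypothetical INT32 column named "foo":

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;
import org.apache.parquet.hadoop.ParquetInputFormat;

import static org.apache.parquet.filter2.predicate.FilterApi.and;
import static org.apache.parquet.filter2.predicate.FilterApi.gtEq;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.ltEq;

public class FilterWiringSketch {
    public static void main(String[] args) {
        // hypothetical column; any INT32 column present in the file schema would do
        IntColumn foo = intColumn("foo");
        FilterPredicate p = and(gtEq(foo, 7), ltEq(foo, 12));

        Configuration conf = new Configuration();
        // serializes the predicate into the configuration that getFilter(...) later reads
        ParquetInputFormat.setFilterPredicate(conf, p);

        // this is the call made in initialize() above; the result feeds filterRowGroups(...)
        FilterCompat.Filter filter = ParquetInputFormat.getFilter(conf);
        System.out.println(filter);
    }
}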

Example 2 with Filter

use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.

Class FilterPredicateLeafBuilder, method buildPredicate.

/**
 * Builds a filter predicate from multiple constants.
 *
 * @param op         IN or BETWEEN
 * @param literals   the constant values to compare the column against
 * @param columnName the name of the column the predicate applies to
 * @return the combined FilterPredicate
 * @throws Exception if a leaf predicate cannot be built for a literal
 */
public FilterPredicate buildPredicate(PredicateLeaf.Operator op, List<Object> literals, String columnName) throws Exception {
    FilterPredicate result = null;
    switch(op) {
        case IN:
            for (Object literal : literals) {
                if (result == null) {
                    result = buildPredict(PredicateLeaf.Operator.EQUALS, literal, columnName);
                } else {
                    result = or(result, buildPredict(PredicateLeaf.Operator.EQUALS, literal, columnName));
                }
            }
            return result;
        case BETWEEN:
            if (literals.size() != 2) {
                throw new RuntimeException("Not able to build 'between' operation filter with " + literals + " which needs two literals");
            }
            Object min = literals.get(0);
            Object max = literals.get(1);
            // NOT (column < min), i.e. column >= min
            FilterPredicate lt = not(buildPredict(PredicateLeaf.Operator.LESS_THAN, min, columnName));
            // column <= max
            FilterPredicate gt = buildPredict(PredicateLeaf.Operator.LESS_THAN_EQUALS, max, columnName);
            // combined: min <= column <= max
            result = FilterApi.and(gt, lt);
            return result;
        default:
            throw new RuntimeException("Unknown PredicateLeaf Operator type: " + op);
    }
}
Also used: FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate)
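
The BETWEEN branch above therefore builds and(not(column < min), column <= max), while IN becomes an OR-chain of equality leaves. A standalone sketch of the same shapes built directly with FilterApi, assuming a hypothetical INT32 column named "age" and integer bounds:

import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;

import static org.apache.parquet.filter2.predicate.FilterApi.and;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.lt;
import static org.apache.parquet.filter2.predicate.FilterApi.ltEq;
import static org.apache.parquet.filter2.predicate.FilterApi.not;
import static org.apache.parquet.filter2.predicate.FilterApi.or;

public class BetweenAndInSketch {
    public static void main(String[] args) {
        IntColumn age = intColumn("age"); // hypothetical column

        // BETWEEN 10 AND 20: NOT (age < 10) AND (age <= 20)
        FilterPredicate between = and(not(lt(age, 10)), ltEq(age, 20));

        // IN (10, 15, 20): an OR-chain of EQUALS leaves, as in the IN case above
        FilterPredicate in = or(eq(age, 10), or(eq(age, 15), eq(age, 20)));

        System.out.println(between);
        System.out.println(in);
    }
}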

Example 3 with Filter

use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.

Class VectorizedParquetRecordReader, method initialize (LLAP cache-aware variant).

@SuppressWarnings("deprecation")
public void initialize(InputSplit oldSplit, JobConf configuration) throws IOException, InterruptedException {
    // the oldSplit may be null during the split phase
    if (oldSplit == null) {
        return;
    }
    ParquetMetadata footer;
    List<BlockMetaData> blocks;
    ParquetInputSplit split = (ParquetInputSplit) oldSplit;
    boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
    this.file = split.getPath();
    long[] rowGroupOffsets = split.getRowGroupOffsets();
    String columnNames = configuration.get(IOConstants.COLUMNS);
    columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
    columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
    // if task.side.metadata is set, rowGroupOffsets is null
    Object cacheKey = null;
    String cacheTag = null;
    // TODO: also support fileKey in splits, like OrcSplit does
    if (metadataCache != null) {
        cacheKey = HdfsUtils.getFileId(file.getFileSystem(configuration), file, HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID), HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID));
    }
    if (cacheKey != null) {
        if (HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_TRACK_CACHE_USAGE)) {
            cacheTag = LlapUtil.getDbAndTableNameForMetrics(file, true);
        }
        // If we are going to use cache, change the path to depend on file ID for extra consistency.
        FileSystem fs = file.getFileSystem(configuration);
        if (cacheKey instanceof Long && HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
            file = HdfsUtils.getFileIdPath(fs, file, (long) cacheKey);
        }
    }
    if (rowGroupOffsets == null) {
        // TODO: check whether rowGroupOffsets can be null
        // then we need to apply the predicate pushdown filter
        footer = readSplitFooter(configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag);
        MessageType fileSchema = footer.getFileMetaData().getSchema();
        FilterCompat.Filter filter = getFilter(configuration);
        blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
    } else {
        // otherwise we find the row groups that were selected on the client
        footer = readSplitFooter(configuration, file, cacheKey, NO_FILTER, cacheTag);
        Set<Long> offsets = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            offsets.add(offset);
        }
        blocks = new ArrayList<>();
        for (BlockMetaData block : footer.getBlocks()) {
            if (offsets.contains(block.getStartingPos())) {
                blocks.add(block);
            }
        }
        // verify we found them all
        if (blocks.size() != rowGroupOffsets.length) {
            long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
            for (int i = 0; i < foundRowGroupOffsets.length; i++) {
                foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
            }
            // provide a good error message in case there's a bug
            throw new IllegalStateException("All the offsets listed in the split should be found in the file." + " expected: " + Arrays.toString(rowGroupOffsets) + " found: " + blocks + " out of: " + Arrays.toString(foundRowGroupOffsets) + " in range " + split.getStart() + ", " + split.getEnd());
        }
    }
    for (BlockMetaData block : blocks) {
        this.totalRowCount += block.getRowCount();
    }
    this.fileSchema = footer.getFileMetaData().getSchema();
    colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
    requestedSchema = DataWritableReadSupport.getRequestedSchema(indexAccess, columnNamesList, columnTypesList, fileSchema, configuration);
    Path path = wrapPathForCache(file, cacheKey, configuration, blocks, cacheTag);
    this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
}
Also used: Path (org.apache.hadoop.fs.Path), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), FilterCompat (org.apache.parquet.filter2.compat.FilterCompat), ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), FileSystem (org.apache.hadoop.fs.FileSystem), ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit), MessageType (org.apache.parquet.schema.MessageType), HashSet (java.util.HashSet)
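
The row-group selection on the else branch is identical to Example 1; read in isolation, it is a small matching step over the footer metadata. A sketch of that step as a hypothetical helper (matchRowGroups is not a Hive method), using only the metadata calls shown above:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.parquet.hadoop.metadata.BlockMetaData;

public class RowGroupMatcherSketch {
    // Hypothetical helper: keeps the blocks whose starting positions were chosen on the client.
    static List<BlockMetaData> matchRowGroups(List<BlockMetaData> footerBlocks, long[] rowGroupOffsets) {
        Set<Long> offsets = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            offsets.add(offset);
        }
        List<BlockMetaData> blocks = new ArrayList<>();
        for (BlockMetaData block : footerBlocks) {
            if (offsets.contains(block.getStartingPos())) {
                blocks.add(block);
            }
        }
        // same sanity check as in the readers above: every requested offset must exist in the file
        if (blocks.size() != rowGroupOffsets.length) {
            throw new IllegalStateException("All the offsets listed in the split should be found in the file."
                + " expected: " + Arrays.toString(rowGroupOffsets) + " found: " + blocks);
        }
        return blocks;
    }
}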

Example 4 with Filter

use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project parquet-mr by apache.

Class RowGroupFilter, method visit.

@Override
public List<BlockMetaData> visit(FilterCompat.FilterPredicateCompat filterPredicateCompat) {
    FilterPredicate filterPredicate = filterPredicateCompat.getFilterPredicate();
    // check that the schema of the filter matches the schema of the file
    SchemaCompatibilityValidator.validate(filterPredicate, schema);
    List<BlockMetaData> filteredBlocks = new ArrayList<BlockMetaData>();
    for (BlockMetaData block : blocks) {
        boolean drop = false;
        if (levels.contains(FilterLevel.STATISTICS)) {
            drop = StatisticsFilter.canDrop(filterPredicate, block.getColumns());
        }
        if (!drop && levels.contains(FilterLevel.DICTIONARY)) {
            drop = DictionaryFilter.canDrop(filterPredicate, block.getColumns(), reader.getDictionaryReader(block));
        }
        if (!drop) {
            filteredBlocks.add(block);
        }
    }
    return filteredBlocks;
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ArrayList (java.util.ArrayList), FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate)
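
This visitor is normally reached through the static RowGroupFilter.filterRowGroups entry point, which appears to be what the Hive readers in Examples 1 and 3 call via a static import. A minimal sketch of that call path, assuming a footer has already been read and using a hypothetical predicate on a column named "foo":

import java.util.List;

import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.compat.RowGroupFilter;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;

public class RowGroupPruningSketch {
    // footer is assumed to have been read elsewhere (e.g. with ParquetFileReader.readFooter)
    static List<BlockMetaData> pruneRowGroups(ParquetMetadata footer) {
        MessageType fileSchema = footer.getFileMetaData().getSchema();

        FilterPredicate p = eq(intColumn("foo"), 7); // hypothetical predicate
        FilterCompat.Filter filter = FilterCompat.get(p); // wrap the predicate as a Filter

        // statistics-based pruning: row groups whose min/max statistics rule out
        // the predicate are dropped, the rest are returned unchanged
        return RowGroupFilter.filterRowGroups(filter, footer.getBlocks(), fileSchema);
    }
}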

Example 5 with Filter

use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project parquet-mr by apache.

Class TestInputFormat, method testGetFilter.

@Test
public void testGetFilter() throws IOException {
    IntColumn intColumn = intColumn("foo");
    FilterPredicate p = or(eq(intColumn, 7), eq(intColumn, 12));
    Configuration conf = new Configuration();
    ParquetInputFormat.setFilterPredicate(conf, p);
    Filter read = ParquetInputFormat.getFilter(conf);
    assertTrue(read instanceof FilterPredicateCompat);
    assertEquals(p, ((FilterPredicateCompat) read).getFilterPredicate());
    conf = new Configuration();
    ParquetInputFormat.setFilterPredicate(conf, not(p));
    read = ParquetInputFormat.getFilter(conf);
    assertTrue(read instanceof FilterPredicateCompat);
    assertEquals(and(notEq(intColumn, 7), notEq(intColumn, 12)), ((FilterPredicateCompat) read).getFilterPredicate());
    assertEquals(FilterCompat.NOOP, ParquetInputFormat.getFilter(new Configuration()));
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), Filter (org.apache.parquet.filter2.compat.FilterCompat.Filter), RecordFilter (org.apache.parquet.filter.RecordFilter), UnboundRecordFilter (org.apache.parquet.filter.UnboundRecordFilter), FilterPredicateCompat (org.apache.parquet.filter2.compat.FilterCompat.FilterPredicateCompat), FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate), IntColumn (org.apache.parquet.filter2.predicate.Operators.IntColumn), Test (org.junit.Test)
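
Beyond the instanceof checks used in this test, consumers can branch on the three Filter shapes (predicate, unbound record filter, NOOP) through FilterCompat.Visitor, the same pattern RowGroupFilter in Example 4 implements. A hedged sketch of that dispatch:

import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.compat.FilterCompat.Filter;

public class FilterDispatchSketch {
    static String describe(Filter filter) {
        return filter.accept(new FilterCompat.Visitor<String>() {

            @Override
            public String visit(FilterCompat.FilterPredicateCompat predicateCompat) {
                // the case asserted via "read instanceof FilterPredicateCompat" in the test
                return "predicate: " + predicateCompat.getFilterPredicate();
            }

            @Override
            public String visit(FilterCompat.UnboundRecordFilterCompat recordFilterCompat) {
                // the older record-level filter API (UnboundRecordFilter)
                return "unbound record filter";
            }

            @Override
            public String visit(FilterCompat.NoOpFilter noOpFilter) {
                // FilterCompat.NOOP, returned when no filter was set on the Configuration
                return "no filter";
            }
        });
    }
}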

Aggregations

FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate): 22 usages
ArrayList (java.util.ArrayList): 7 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 6 usages
Path (org.apache.hadoop.fs.Path): 5 usages
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 5 usages
FilterCompat (org.apache.parquet.filter2.compat.FilterCompat): 4 usages
MessageType (org.apache.parquet.schema.MessageType): 4 usages
HashSet (java.util.HashSet): 3 usages
List (java.util.List): 3 usages
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 3 usages
Pair (uk.gov.gchq.gaffer.commonutil.pair.Pair): 3 usages
HashMap (java.util.HashMap): 2 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 2 usages
SearchArgument (org.apache.hadoop.hive.ql.io.sarg.SearchArgument): 2 usages
UnboundRecordFilter (org.apache.parquet.filter.UnboundRecordFilter): 2 usages
Filter (org.apache.parquet.filter2.compat.FilterCompat.Filter): 2 usages
FilterPredicateCompat (org.apache.parquet.filter2.compat.FilterCompat.FilterPredicateCompat): 2 usages
ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit): 2 usages
Test (org.junit.Test): 2 usages
ViewElementDefinition (uk.gov.gchq.gaffer.data.elementdefinition.view.ViewElementDefinition): 2 usages