Example 21 with Filter

use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.

the class FilterPredicateLeafBuilder method buildPredicate.

/**
 * Builds a filter predicate from multiple constants.
 *
 * @param op         IN or BETWEEN
 * @param literals   the constant values to compare against
 * @param columnName the name of the column being filtered
 * @param columnType the Hive type of the column
 * @return the composed FilterPredicate
 */
public FilterPredicate buildPredicate(PredicateLeaf.Operator op, List<Object> literals, String columnName, TypeInfo columnType) throws Exception {
    FilterPredicate result = null;
    switch(op) {
        case IN:
            for (Object literal : literals) {
                if (result == null) {
                    result = buildPredict(PredicateLeaf.Operator.EQUALS, literal, columnName, columnType);
                } else {
                    result = or(result, buildPredict(PredicateLeaf.Operator.EQUALS, literal, columnName, columnType));
                }
            }
            return result;
        case BETWEEN:
            if (literals.size() != 2) {
                throw new RuntimeException("Not able to build 'between' operation filter with " + literals + " which needs two literals");
            }
            Object min = literals.get(0);
            Object max = literals.get(1);
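            // BETWEEN min AND max decomposes to NOT(column < min) AND (column <= max)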
            FilterPredicate lt = not(buildPredict(PredicateLeaf.Operator.LESS_THAN, min, columnName, columnType));
            FilterPredicate gt = buildPredict(PredicateLeaf.Operator.LESS_THAN_EQUALS, max, columnName, columnType);
            result = FilterApi.and(gt, lt);
            return result;
        default:
            throw new RuntimeException("Unknown PredicateLeaf Operator type: " + op);
    }
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate)
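
For reference, the shapes that buildPredicate produces can be reproduced directly with parquet-mr's FilterApi. A minimal sketch (the column name x and the literal values are hypothetical):

import static org.apache.parquet.filter2.predicate.FilterApi.*;

import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;

public class PredicateShapeSketch {
    public static void main(String[] args) {
        IntColumn x = intColumn("x");
        // IN (1, 2, 3) becomes a left-leaning OR chain of equality leaves,
        // mirroring the loop in buildPredicate above.
        FilterPredicate in = or(or(eq(x, 1), eq(x, 2)), eq(x, 3));
        // BETWEEN 10 AND 20 becomes NOT(x < 10) AND (x <= 20).
        FilterPredicate between = and(ltEq(x, 20), not(lt(x, 10)));
        System.out.println(in);
        System.out.println(between);
    }
}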

Example 22 with Filter

use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.

the class ParquetRecordReaderBase method getSplit.

/**
 * gets a ParquetInputSplit corresponding to a split given by Hive
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
@SuppressWarnings("deprecation")
protected ParquetInputSplit getSplit(final org.apache.hadoop.mapred.InputSplit oldSplit, final JobConf conf) throws IOException {
    if (oldSplit.getLength() == 0) {
        return null;
    }
    ParquetInputSplit split;
    if (oldSplit instanceof FileSplit) {
        final Path finalPath = ((FileSplit) oldSplit).getPath();
        jobConf = projectionPusher.pushProjectionsAndFilters(conf, finalPath.getParent());
        // TODO enable MetadataFilter by using readFooter(Configuration configuration, Path file,
        // MetadataFilter filter) API
        final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(jobConf, finalPath);
        final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
        final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        final ReadSupport.ReadContext readContext = new DataWritableReadSupport().init(new InitContext(jobConf, null, fileMetaData.getSchema()));
        // Compute stats
        for (BlockMetaData bmd : blocks) {
            serDeStats.setRowCount(serDeStats.getRowCount() + bmd.getRowCount());
            serDeStats.setRawDataSize(serDeStats.getRawDataSize() + bmd.getTotalByteSize());
        }
        schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();
        final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
        final long splitStart = ((FileSplit) oldSplit).getStart();
        final long splitLength = ((FileSplit) oldSplit).getLength();
        for (final BlockMetaData block : blocks) {
            final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
                splitGroup.add(block);
            }
        }
        if (splitGroup.isEmpty()) {
            LOG.warn("Skipping split, could not find row group in: " + oldSplit);
            return null;
        }
        FilterCompat.Filter filter = setFilter(jobConf, fileMetaData.getSchema());
        if (filter != null) {
            filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
            if (filtedBlocks.isEmpty()) {
                LOG.debug("All row groups are dropped due to filter predicates");
                return null;
            }
            long droppedBlocks = splitGroup.size() - filtedBlocks.size();
            if (droppedBlocks > 0) {
                LOG.debug("Dropping " + droppedBlocks + " row groups that do not pass filter predicate");
            }
        } else {
            filtedBlocks = splitGroup;
        }
        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION)) {
            skipTimestampConversion = !Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr");
        }
        skipProlepticConversion = DataWritableReadSupport.getWriterDateProleptic(fileMetaData.getKeyValueMetaData());
        if (skipProlepticConversion == null) {
            skipProlepticConversion = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PARQUET_DATE_PROLEPTIC_GREGORIAN_DEFAULT);
        }
        legacyConversionEnabled = HiveConf.getBoolVar(conf, ConfVars.HIVE_PARQUET_TIMESTAMP_LEGACY_CONVERSION_ENABLED);
        if (fileMetaData.getKeyValueMetaData().containsKey(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY)) {
            legacyConversionEnabled = Boolean.parseBoolean(fileMetaData.getKeyValueMetaData().get(DataWritableWriteSupport.WRITER_ZONE_CONVERSION_LEGACY));
        }
        split = new ParquetInputSplit(finalPath, splitStart, splitLength, oldSplit.getLocations(), filtedBlocks, readContext.getRequestedSchema().toString(), fileMetaData.getSchema().toString(), fileMetaData.getKeyValueMetaData(), readContext.getReadSupportMetadata());
        return split;
    } else {
        throw new IllegalArgumentException("Unknown split type: " + oldSplit);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) DataWritableReadSupport(org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) FilterCompat(org.apache.parquet.filter2.compat.FilterCompat) ArrayList(java.util.ArrayList) FileSplit(org.apache.hadoop.mapred.FileSplit) ReadSupport(org.apache.parquet.hadoop.api.ReadSupport) InitContext(org.apache.parquet.hadoop.api.InitContext) ParquetInputSplit(org.apache.parquet.hadoop.ParquetInputSplit) FileMetaData(org.apache.parquet.hadoop.metadata.FileMetaData)
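
The row-group selection above follows a single rule: a block belongs to this split if and only if the first data page of its first column chunk starts inside the split's half-open byte range. A standalone sketch of that check, with hypothetical offsets:

public class SplitAssignmentSketch {
    // Mirrors the loop over blocks in getSplit: a row group is claimed by
    // the split whose range [start, start + length) contains the first
    // data page offset of the group's first column chunk, so each group
    // is assigned to exactly one split.
    static boolean belongsToSplit(long firstDataPage, long splitStart, long splitLength) {
        return firstDataPage >= splitStart && firstDataPage < splitStart + splitLength;
    }

    public static void main(String[] args) {
        long start = 0L, length = 134_217_728L; // a hypothetical 128 MB split
        System.out.println(belongsToSplit(4L, start, length));      // true
        System.out.println(belongsToSplit(length, start, length));  // false: the next split claims it
    }
}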

Example 23 with Filter

use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.

the class ParquetRecordReaderBase method setFilter.

public FilterCompat.Filter setFilter(final JobConf conf, MessageType schema) {
    SearchArgument sarg = ConvertAstToSearchArg.createFromConf(conf);
    if (sarg == null) {
        return null;
    }
    String columnTypes = conf.get(IOConstants.COLUMNS_TYPES);
    String columnNames = conf.get(IOConstants.COLUMNS);
    List<TypeInfo> columnTypeList = TypeInfoUtils.getTypeInfosFromTypeString(columnTypes);
    Map<String, TypeInfo> columns = new HashMap<>();
    String[] names = columnNames.split(",");
    for (int i = 0; i < names.length; i++) {
        columns.put(names[i], columnTypeList.get(i));
    }
    // Create the Parquet FilterPredicate without including columns that do not exist
    // on the schema (such as partition columns).
    FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema, columns);
    if (p != null) {
        // Filter may have sensitive information. Do not send to debug.
        LOG.debug("PARQUET predicate push down generated.");
        ParquetInputFormat.setFilterPredicate(conf, p);
        return FilterCompat.get(p);
    } else {
        // Filter may have sensitive information. Do not send to debug.
        LOG.debug("No PARQUET predicate push down is generated.");
        return null;
    }
}
Also used : HashMap(java.util.HashMap) SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)
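
Once setFilter has produced a FilterCompat.Filter, it can be handed to any parquet-mr read path. A minimal sketch using the standalone ParquetReader (the file path and column name are hypothetical):

import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.io.api.Binary;

public class FilteredReadSketch {
    public static void main(String[] args) throws Exception {
        FilterPredicate p = eq(binaryColumn("name"), Binary.fromString("alice"));
        try (ParquetReader<Group> reader = ParquetReader
                .builder(new GroupReadSupport(), new Path("/tmp/example.parquet"))
                .withFilter(FilterCompat.get(p))
                .build()) {
            for (Group g; (g = reader.read()) != null; ) {
                System.out.println(g);
            }
        }
    }
}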

Example 24 with Filter

use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project hive by apache.

the class TestParquetFilterPredicate method testFilterColumnsThatDoNoExistOnSchema.

@Test
public void testFilterColumnsThatDoNoExistOnSchema() {
    MessageType schema = MessageTypeParser.parseMessageType("message test { required int32 a; required binary stinger; }");
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
            .startNot()
            .startOr()
            .isNull("a", PredicateLeaf.Type.LONG)
            .between("y", PredicateLeaf.Type.LONG, 10L, 20L) // Column will be removed from filter
            .in("z", PredicateLeaf.Type.LONG, 1L, 2L, 3L) // Column will be removed from filter
            .nullSafeEquals("stinger", PredicateLeaf.Type.STRING, "stinger")
            .end()
            .end()
            .build();
    Map<String, TypeInfo> columnTypes = new HashMap<>();
    columnTypes.put("a", TypeInfoFactory.getPrimitiveTypeInfo("int"));
    columnTypes.put("y", TypeInfoFactory.getPrimitiveTypeInfo("int"));
    columnTypes.put("z", TypeInfoFactory.getPrimitiveTypeInfo("int"));
    columnTypes.put("stinger", TypeInfoFactory.getPrimitiveTypeInfo("string"));
    FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema, columnTypes);
    String expected = "and(not(eq(a, null)), not(eq(stinger, Binary{\"stinger\"})))";
    assertEquals(expected, p.toString());
}
Also used : HashMap(java.util.HashMap) SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)
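
The expected string follows in two steps: y and z are absent from the Parquet schema, so their leaves are pruned from the OR, and the surrounding NOT then distributes over the remaining disjuncts by De Morgan's law. A hypothetical sketch of the resulting predicate, built by hand with FilterApi:

import static org.apache.parquet.filter2.predicate.FilterApi.*;

import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.io.api.Binary;

public class DeMorganSketch {
    public static void main(String[] args) {
        // not(or(isNull(a), nullSafeEq(stinger, "stinger"))) rewrites to
        // the conjunction the test asserts; eq against null means "is null".
        FilterPredicate rewritten = and(
                not(eq(intColumn("a"), null)),
                not(eq(binaryColumn("stinger"), Binary.fromString("stinger"))));
        System.out.println(rewritten);
        // and(not(eq(a, null)), not(eq(stinger, Binary{"stinger"})))
    }
}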

Example 25 with Filter

use of org.apache.parquet.filter2.compat.FilterCompat.Filter in project Gaffer by gchq.

the class QueryGenerator method getIsEqualFilter.

private FilterPredicate getIsEqualFilter(final String colName, final Object[] parquetObjects, final String group) {
    String[] paths = schemaUtils.getPaths(group, colName);
    if (null == paths) {
        paths = new String[1];
        paths[0] = colName;
    }
    FilterPredicate filter = null;
    for (int i = 0; i < paths.length; i++) {
        final String path = paths[i];
        FilterPredicate tempFilter;
        if (parquetObjects[i] instanceof String) {
            tempFilter = eq(binaryColumn(path), Binary.fromString((String) parquetObjects[i]));
        } else if (parquetObjects[i] instanceof Boolean) {
            tempFilter = eq(booleanColumn(path), (Boolean) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Double) {
            tempFilter = eq(doubleColumn(path), (Double) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Float) {
            tempFilter = eq(floatColumn(path), (Float) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Integer) {
            tempFilter = eq(intColumn(path), (Integer) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof Long) {
            tempFilter = eq(longColumn(path), (Long) parquetObjects[i]);
        } else if (parquetObjects[i] instanceof java.util.Date) {
            tempFilter = eq(longColumn(path), ((java.util.Date) parquetObjects[i]).getTime());
        } else if (parquetObjects[i] instanceof java.sql.Date) {
            tempFilter = eq(longColumn(path), ((java.sql.Date) parquetObjects[i]).getTime());
        } else if (parquetObjects[i] instanceof Short) {
            tempFilter = eq(intColumn(path), ((Short) parquetObjects[i]).intValue());
        } else if (parquetObjects[i] instanceof byte[]) {
            tempFilter = eq(binaryColumn(path), Binary.fromReusedByteArray((byte[]) parquetObjects[i]));
        } else {
            LOGGER.warn(parquetObjects[i].getClass().getCanonicalName() + " is not a natively supported type for the IsEqual filter, therefore execution will take longer to perform this filter.");
            return null;
        }
        if (null == filter) {
            filter = tempFilter;
        } else {
            filter = and(filter, tempFilter);
        }
    }
    return filter;
}
Also used : FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate)
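
A hypothetical call sketch: a String value lands in the binaryColumn branch above, and when schemaUtils maps the property to more than one Parquet path the per-path predicates are combined with and:

import static org.apache.parquet.filter2.predicate.FilterApi.*;

import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.io.api.Binary;

public class IsEqualDispatchSketch {
    public static void main(String[] args) {
        // Roughly what getIsEqualFilter builds for a property backed by two
        // paths (both path names and values are hypothetical).
        FilterPredicate first = eq(binaryColumn("prop.part0"), Binary.fromString("v0"));
        FilterPredicate second = eq(longColumn("prop.part1"), 2L);
        FilterPredicate combined = and(first, second);
        System.out.println(combined);
    }
}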

Aggregations

FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate): 24
ArrayList (java.util.ArrayList): 7
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 6
Path (org.apache.hadoop.fs.Path): 5
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 5
MessageType (org.apache.parquet.schema.MessageType): 5
HashMap (java.util.HashMap): 4
FilterCompat (org.apache.parquet.filter2.compat.FilterCompat): 4
HashSet (java.util.HashSet): 3
List (java.util.List): 3
TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 3
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 3
Pair (uk.gov.gchq.gaffer.commonutil.pair.Pair): 3
FileSystem (org.apache.hadoop.fs.FileSystem): 2
SearchArgument (org.apache.hadoop.hive.ql.io.sarg.SearchArgument): 2
UnboundRecordFilter (org.apache.parquet.filter.UnboundRecordFilter): 2
Filter (org.apache.parquet.filter2.compat.FilterCompat.Filter): 2
FilterPredicateCompat (org.apache.parquet.filter2.compat.FilterCompat.FilterPredicateCompat): 2
ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit): 2
Test (org.junit.Test): 2