
Example 11 with MessageType

Use of org.apache.parquet.schema.MessageType in project hive by apache.

From the class TestArrayCompatibility, method testUnannotatedListOfPrimitives.

@Test
public void testUnannotatedListOfPrimitives() throws Exception {
    MessageType fileSchema = Types.buildMessage()
            .repeated(INT32).named("list_of_ints")
            .named("UnannotatedListOfPrimitives");
    Path test = writeDirect("UnannotatedListOfPrimitives", fileSchema, new DirectWriter() {

        @Override
        public void write(RecordConsumer rc) {
            rc.startMessage();
            rc.startField("list_of_ints", 0);
            rc.addInteger(34);
            rc.addInteger(35);
            rc.addInteger(36);
            rc.endField("list_of_ints", 0);
            rc.endMessage();
        }
    });
    ArrayWritable expected = list(new IntWritable(34), new IntWritable(35), new IntWritable(36));
    List<ArrayWritable> records = read(test);
    Assert.assertEquals("Should have only one record", 1, records.size());
    assertEquals("Should match expected record", expected, records.get(0));
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayWritable(org.apache.hadoop.io.ArrayWritable) RecordConsumer(org.apache.parquet.io.api.RecordConsumer) MessageType(org.apache.parquet.schema.MessageType) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)
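
For reference, a minimal standalone sketch (not taken from the Hive test; the class name SchemaSketch is made up for illustration) showing that the schema built above with the Types builder is the same schema written in Parquet's message syntax.

import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.Types;

public class SchemaSketch {

    public static void main(String[] args) {
        // Same schema as in the test, built programmatically.
        MessageType built = Types.buildMessage()
                .repeated(INT32).named("list_of_ints")
                .named("UnannotatedListOfPrimitives");
        // The equivalent schema parsed from Parquet's message syntax.
        MessageType parsed = MessageTypeParser.parseMessageType(
                "message UnannotatedListOfPrimitives {\n"
                + "  repeated int32 list_of_ints;\n"
                + "}");
        System.out.println(built.equals(parsed)); // expected: true
        System.out.println(built);                // prints the schema in message syntax
    }
}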

Example 12 with MessageType

Use of org.apache.parquet.schema.MessageType in project hive by apache.

From the class TestParquetRecordReaderWrapper, method testBuilder.

@Test
public void testBuilder() throws Exception {
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
            .startNot()
            .startOr()
            .isNull("x", PredicateLeaf.Type.LONG)
            .between("y", PredicateLeaf.Type.LONG, 10L, 20L)
            .in("z", PredicateLeaf.Type.LONG, 1L, 2L, 3L)
            .nullSafeEquals("a", PredicateLeaf.Type.STRING, "stinger")
            .end()
            .end()
            .build();
    MessageType schema = MessageTypeParser.parseMessageType("message test {"
            + " optional int32 x; required int32 y; required int32 z;"
            + " optional binary a;}");
    FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema);
    String expected = "and(and(and(not(eq(x, null)), not(and(lteq(y, 20), not(lt(y, 10))))), not(or(or(eq(z, 1), "
            + "eq(z, 2)), eq(z, 3)))), not(eq(a, Binary{\"stinger\"})))";
    assertEquals(expected, p.toString());
}
Also used : SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)
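
As a point of comparison, here is a minimal standalone sketch (an illustration, not part of the Hive test; the class name FilterApiSketch is made up) that builds the same kind of predicate directly with Parquet's FilterApi, showing where the lteq/lt pair for the BETWEEN clause in the expected string comes from.

import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;

public class FilterApiSketch {

    public static void main(String[] args) {
        IntColumn y = FilterApi.intColumn("y");
        // "y BETWEEN 10 AND 20" expanded into range bounds:
        // lteq(y, 20) AND NOT lt(y, 10)
        FilterPredicate between = FilterApi.and(
                FilterApi.ltEq(y, 20),
                FilterApi.not(FilterApi.lt(y, 10)));
        System.out.println(between); // e.g. and(lteq(y, 20), not(lt(y, 10)))
    }
}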

Example 13 with MessageType

Use of org.apache.parquet.schema.MessageType in project hive by apache.

From the class TestParquetRecordReaderWrapper, method testBuilderFloat.

@Test
public void testBuilderFloat() throws Exception {
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
            .startAnd()
            .lessThan("x", PredicateLeaf.Type.LONG, 22L)
            .lessThan("x1", PredicateLeaf.Type.LONG, 22L)
            .lessThanEquals("y", PredicateLeaf.Type.STRING, new HiveChar("hi", 10).toString())
            .equals("z", PredicateLeaf.Type.FLOAT, new Double(0.22))
            .equals("z1", PredicateLeaf.Type.FLOAT, new Double(0.22))
            .end()
            .build();
    MessageType schema = MessageTypeParser.parseMessageType("message test {"
            + " required int32 x; required int32 x1;"
            + " required binary y; required float z; required float z1;}");
    FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema);
    String expected = "and(and(and(and(lt(x, 22), lt(x1, 22)),"
            + " lteq(y, Binary{\"hi        \"})), eq(z, "
            + "0.22)), eq(z1, 0.22))";
    assertEquals(expected, p.toString());
}
Also used : HiveChar(org.apache.hadoop.hive.common.type.HiveChar) SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)
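
Once converted, such a predicate is normally handed to Parquet through the job configuration so that readers can filter at read time. A minimal sketch of that step (assuming a Configuration named conf and a FilterPredicate p such as the one produced above; the class and method names below are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class PushDownSketch {

    // Serializes the predicate into the configuration; Parquet's readers pick it
    // up and use it for row-group and record-level filtering.
    public static void pushDown(Configuration conf, FilterPredicate p) {
        ParquetInputFormat.setFilterPredicate(conf, p);
    }
}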

Example 14 with MessageType

Use of org.apache.parquet.schema.MessageType in project hive by apache.

From the class TestParquetRowGroupFilter, method testRowGroupFilterTakeEffect.

@Test
public void testRowGroupFilterTakeEffect() throws Exception {
    // define schema
    columnNames = "intCol";
    columnTypes = "int";
    StructObjectInspector inspector = getObjectInspector(columnNames, columnTypes);
    MessageType fileSchema = MessageTypeParser.parseMessageType(
            "message hive_schema {\n"
            + "  optional int32 intCol;\n"
            + "}\n");
    conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "intCol");
    conf.set("columns", "intCol");
    conf.set("columns.types", "int");
    // create Parquet file with specific data
    Path testPath = writeDirect("RowGroupFilterTakeEffect", fileSchema, new DirectWriter() {

        @Override
        public void write(RecordConsumer consumer) {
            for (int i = 0; i < 100; i++) {
                consumer.startMessage();
                consumer.startField("int", 0);
                consumer.addInteger(i);
                consumer.endField("int", 0);
                consumer.endMessage();
            }
        }
    });
    // > 50
    GenericUDF udf = new GenericUDFOPGreaterThan();
    List<ExprNodeDesc> children = Lists.newArrayList();
    ExprNodeColumnDesc columnDesc = new ExprNodeColumnDesc(Integer.class, "intCol", "T", false);
    ExprNodeConstantDesc constantDesc = new ExprNodeConstantDesc(50);
    children.add(columnDesc);
    children.add(constantDesc);
    ExprNodeGenericFuncDesc genericFuncDesc = new ExprNodeGenericFuncDesc(inspector, udf, children);
    String searchArgumentStr = SerializationUtilities.serializeExpression(genericFuncDesc);
    conf.set(TableScanDesc.FILTER_EXPR_CONF_STR, searchArgumentStr);
    ParquetRecordReaderWrapper recordReader = (ParquetRecordReaderWrapper) new MapredParquetInputFormat()
            .getRecordReader(new FileSplit(testPath, 0, fileLength(testPath), (String[]) null), conf, null);
    Assert.assertEquals("row group is not filtered correctly", 1, recordReader.getFiltedBlocks().size());
    // > 100
    constantDesc = new ExprNodeConstantDesc(100);
    children.set(1, constantDesc);
    genericFuncDesc = new ExprNodeGenericFuncDesc(inspector, udf, children);
    searchArgumentStr = SerializationUtilities.serializeExpression(genericFuncDesc);
    conf.set(TableScanDesc.FILTER_EXPR_CONF_STR, searchArgumentStr);
    recordReader = (ParquetRecordReaderWrapper) new MapredParquetInputFormat()
            .getRecordReader(new FileSplit(testPath, 0, fileLength(testPath), (String[]) null), conf, null);
    Assert.assertEquals("row group is not filtered correctly", 0, recordReader.getFiltedBlocks().size());
}
Also used : Path(org.apache.hadoop.fs.Path) GenericUDFOPGreaterThan(org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) ParquetRecordReaderWrapper(org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper) RecordConsumer(org.apache.parquet.io.api.RecordConsumer) FileSplit(org.apache.hadoop.mapred.FileSplit) GenericUDF(org.apache.hadoop.hive.ql.udf.generic.GenericUDF) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) MessageType(org.apache.parquet.schema.MessageType) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)
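
For intuition, the serialized push-down of "intCol > 50" corresponds roughly to the following Parquet predicate (a standalone illustration, not the code path Hive actually executes). The single row group written above holds the values 0..99, so its min/max statistics overlap "> 50" and the group survives, while "> 100" lets the whole group be dropped; that is what the two assertions on getFiltedBlocks() check.

import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;

public class RowGroupFilterSketch {

    public static void main(String[] args) {
        IntColumn intCol = FilterApi.intColumn("intCol");
        // Row-group statistics for the file written above are min=0, max=99.
        FilterPredicate keepsTheGroup = FilterApi.gt(intCol, 50);  // 50 < max, group cannot be eliminated
        FilterPredicate dropsTheGroup = FilterApi.gt(intCol, 100); // 100 > max, group can be eliminated
        System.out.println(keepsTheGroup); // e.g. gt(intCol, 50)
        System.out.println(dropsTheGroup); // e.g. gt(intCol, 100)
    }
}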

Example 15 with MessageType

Use of org.apache.parquet.schema.MessageType in project hive by apache.

From the class VectorizedParquetRecordReader, method initialize.

public void initialize(ParquetInputSplit split, JobConf configuration) throws IOException, InterruptedException {
    jobConf = configuration;
    ParquetMetadata footer;
    List<BlockMetaData> blocks;
    boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
    this.file = split.getPath();
    long[] rowGroupOffsets = split.getRowGroupOffsets();
    String columnNames = configuration.get(IOConstants.COLUMNS);
    columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
    columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
    // if task.side.metadata is set, rowGroupOffsets is null
    if (rowGroupOffsets == null) {
        //TODO check whether rowGroupOffSets can be null
        // then we need to apply the predicate push down filter
        footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
        MessageType fileSchema = footer.getFileMetaData().getSchema();
        FilterCompat.Filter filter = getFilter(configuration);
        blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
    } else {
        // otherwise we find the row groups that were selected on the client
        footer = readFooter(configuration, file, NO_FILTER);
        Set<Long> offsets = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            offsets.add(offset);
        }
        blocks = new ArrayList<>();
        for (BlockMetaData block : footer.getBlocks()) {
            if (offsets.contains(block.getStartingPos())) {
                blocks.add(block);
            }
        }
        // verify we found them all
        if (blocks.size() != rowGroupOffsets.length) {
            long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
            for (int i = 0; i < foundRowGroupOffsets.length; i++) {
                foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
            }
            // provide a good error message in case there's a bug
            throw new IllegalStateException("All the offsets listed in the split should be found in the file."
                + " expected: " + Arrays.toString(rowGroupOffsets)
                + " found: " + blocks
                + " out of: " + Arrays.toString(foundRowGroupOffsets)
                + " in range " + split.getStart() + ", " + split.getEnd());
        }
    }
    for (BlockMetaData block : blocks) {
        this.totalRowCount += block.getRowCount();
    }
    this.fileSchema = footer.getFileMetaData().getSchema();
    MessageType tableSchema;
    if (indexAccess) {
        List<Integer> indexSequence = new ArrayList<>();
        // Generates a sequence list of indexes
        for (int i = 0; i < columnNamesList.size(); i++) {
            indexSequence.add(i);
        }
        tableSchema = DataWritableReadSupport.getSchemaByIndex(fileSchema, columnNamesList, indexSequence);
    } else {
        tableSchema = DataWritableReadSupport.getSchemaByName(fileSchema, columnNamesList, columnTypesList);
    }
    indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
    if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) {
        requestedSchema = DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, indexColumnsWanted);
    } else {
        requestedSchema = fileSchema;
    }
    this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) FilterCompat(org.apache.parquet.filter2.compat.FilterCompat) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ArrayList(java.util.ArrayList) MessageType(org.apache.parquet.schema.MessageType) HashSet(java.util.HashSet)
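
For context, a minimal standalone sketch (assuming a Configuration named conf and a file Path; FooterSketch is an illustrative name) of the footer metadata this method works with, listing each row group's starting offset and row count the same way initialize() matches split offsets against block positions:

import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterSketch {

    public static void listRowGroups(Configuration conf, Path file) throws IOException {
        // Read only the footer; NO_FILTER keeps metadata for every row group.
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
        for (BlockMetaData block : footer.getBlocks()) {
            System.out.println("row group at offset " + block.getStartingPos()
                    + " with " + block.getRowCount() + " rows"
                    + " (" + block.getColumns().size() + " column chunks)");
        }
    }
}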

Aggregations

MessageType (org.apache.parquet.schema.MessageType) 40
Test (org.junit.Test) 23
FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate) 13
SearchArgument (org.apache.hadoop.hive.ql.io.sarg.SearchArgument) 7
Type (org.apache.parquet.schema.Type) 7
Path (org.apache.hadoop.fs.Path) 6
GroupType (org.apache.parquet.schema.GroupType) 6
Configuration (org.apache.hadoop.conf.Configuration) 5
ArrayList (java.util.ArrayList) 4
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData) 4
OriginalType (org.apache.parquet.schema.OriginalType) 4
HashMap (java.util.HashMap) 3
SchemaPath (org.apache.drill.common.expression.SchemaPath) 3
HiveChar (org.apache.hadoop.hive.common.type.HiveChar) 3
PrimitiveType (org.apache.parquet.schema.PrimitiveType) 3
DimensionSchema (io.druid.data.input.impl.DimensionSchema) 2
File (java.io.File) 2
HiveVarchar (org.apache.hadoop.hive.common.type.HiveVarchar) 2
HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) 2
TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) 2