
Example 6 with RecordConsumer

Use of org.apache.parquet.io.api.RecordConsumer in project hive by apache.

From class TestArrayCompatibility, method testUnannotatedListOfPrimitives.

@Test
public void testUnannotatedListOfPrimitives() throws Exception {
    MessageType fileSchema = Types.buildMessage()
        .repeated(INT32).named("list_of_ints")
        .named("UnannotatedListOfPrimitives");
    Path test = writeDirect("UnannotatedListOfPrimitives", fileSchema, new DirectWriter() {

        @Override
        public void write(RecordConsumer rc) {
            rc.startMessage();
            rc.startField("list_of_ints", 0);
            rc.addInteger(34);
            rc.addInteger(35);
            rc.addInteger(36);
            rc.endField("list_of_ints", 0);
            rc.endMessage();
        }
    });
    ArrayWritable expected = list(new IntWritable(34), new IntWritable(35), new IntWritable(36));
    List<ArrayWritable> records = read(test);
    Assert.assertEquals("Should have only one record", 1, records.size());
    assertEquals("Should match expected record", expected, records.get(0));
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayWritable (org.apache.hadoop.io.ArrayWritable), RecordConsumer (org.apache.parquet.io.api.RecordConsumer), MessageType (org.apache.parquet.schema.MessageType), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
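Every example on this page drives writing through a writeDirect helper and a DirectWriter callback that the snippets reference but never show. Below is a minimal sketch of that plumbing, assuming the shape of the helper in Hive's test base class; the DirectWriteSupport name and the output path are illustrative assumptions, not the project's actual code.

import java.io.IOException;
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.MessageType;

// Callback each test implements; writeDirect invokes it once against the open file.
interface DirectWriter {
    void write(RecordConsumer consumer);
}

// WriteSupport that hands the file's low-level RecordConsumer to the callback.
class DirectWriteSupport extends WriteSupport<DirectWriter> {
    private final MessageType type;
    private RecordConsumer consumer;

    DirectWriteSupport(MessageType type) {
        this.type = type;
    }

    @Override
    public WriteContext init(Configuration conf) {
        return new WriteContext(type, new HashMap<String, String>());
    }

    @Override
    public void prepareForWrite(RecordConsumer consumer) {
        this.consumer = consumer;
    }

    @Override
    public void write(DirectWriter writer) {
        writer.write(consumer);
    }
}

// Creates the file and runs the callback; each rc.startMessage()/endMessage()
// pair inside the callback produces one row in the resulting Parquet file.
class WriteDirectHelper {
    static Path writeDirect(String name, MessageType type, DirectWriter writer) throws IOException {
        Path path = new Path("target/tests/" + name + ".parquet"); // assumed location
        ParquetWriter<DirectWriter> parquetWriter =
            new ParquetWriter<DirectWriter>(path, new DirectWriteSupport(type));
        parquetWriter.write(writer);
        parquetWriter.close();
        return path;
    }
}

Example 6 above exercises the oldest list encoding: a bare repeated int32 with no LIST annotation, which the Hive reader still interprets as an array of ints.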

Example 7 with RecordConsumer

Use of org.apache.parquet.io.api.RecordConsumer in project hive by apache.

From class TestArrayCompatibility, method testAvroSingleFieldGroupInList.

@Test
public void testAvroSingleFieldGroupInList() throws Exception {
    // this tests the case where older data has an ambiguous structure, but the
    // correct interpretation can be determined from the repeated name, "array"
    Path test = writeDirect("AvroSingleFieldGroupInList", Types.buildMessage().optionalGroup().as(LIST).repeatedGroup().required(INT64).named("count").named("array").named("single_element_groups").named("AvroSingleFieldGroupInList"), new DirectWriter() {

        @Override
        public void write(RecordConsumer rc) {
            rc.startMessage();
            rc.startField("single_element_groups", 0);
            rc.startGroup();
            // start writing array contents
            rc.startField("array", 0);
            rc.startGroup();
            rc.startField("count", 0);
            rc.addLong(1234L);
            rc.endField("count", 0);
            rc.endGroup();
            rc.startGroup();
            rc.startField("count", 0);
            rc.addLong(2345L);
            rc.endField("count", 0);
            rc.endGroup();
            // finished writing array contents
            rc.endField("array", 0);
            rc.endGroup();
            rc.endField("single_element_groups", 0);
            rc.endMessage();
        }
    });
    ArrayWritable expected = list(record(new LongWritable(1234L)), record(new LongWritable(2345L)));
    List<ArrayWritable> records = read(test);
    Assert.assertEquals("Should have only one record", 1, records.size());
    assertEquals("Should match expected record", expected, records.get(0));
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayWritable (org.apache.hadoop.io.ArrayWritable), LongWritable (org.apache.hadoop.io.LongWritable), RecordConsumer (org.apache.parquet.io.api.RecordConsumer), Test (org.junit.Test)
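The builder chain above produces the legacy two-level Avro list layout. For readability, here is the same schema written out as a Parquet message string, an equivalent sketch using MessageTypeParser that is not part of the original test:

// Equivalent to the Types.buildMessage() chain in the test above.
MessageType schema = MessageTypeParser.parseMessageType(
    "message AvroSingleFieldGroupInList {\n" +
    "  optional group single_element_groups (LIST) {\n" +
    "    repeated group array {\n" +
    "      required int64 count;\n" +
    "    }\n" +
    "  }\n" +
    "}");

A repeated group holding a single field is ambiguous under the Parquet LIST rules: it could be a three-level wrapper around an int64 element, or itself the two-level element (a one-field struct). Because the repeated group is named "array", the backward-compatibility rules treat it as the element type, which is why the expected records are structs wrapping a LongWritable rather than bare longs.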

Example 8 with RecordConsumer

Use of org.apache.parquet.io.api.RecordConsumer in project hive by apache.

From class TestMapStructures, method testDoubleMapWithStructValue.

@Test
public void testDoubleMapWithStructValue() throws Exception {
    Path test = writeDirect("DoubleMapWithStructValue", Types.buildMessage().optionalGroup().as(MAP).repeatedGroup().optional(DOUBLE).named("key").optionalGroup().required(INT32).named("x").required(INT32).named("y").named("value").named("key_value").named("approx").named("DoubleMapWithStructValue"), new TestArrayCompatibility.DirectWriter() {

        @Override
        public void write(RecordConsumer rc) {
            rc.startMessage();
            rc.startField("approx", 0);
            rc.startGroup();
            rc.startField("key_value", 0);
            rc.startGroup();
            rc.startField("key", 0);
            rc.addDouble(3.14);
            rc.endField("key", 0);
            rc.startField("value", 1);
            rc.startGroup();
            rc.startField("x", 0);
            rc.addInteger(7);
            rc.endField("x", 0);
            rc.startField("y", 1);
            rc.addInteger(22);
            rc.endField("y", 1);
            rc.endGroup();
            rc.endField("value", 1);
            rc.endGroup();
            rc.endField("key_value", 0);
            rc.endGroup();
            rc.endField("approx", 0);
            rc.endMessage();
        }
    });
    ArrayWritable expected = list(record(new DoubleWritable(3.14), record(new IntWritable(7), new IntWritable(22))));
    List<ArrayWritable> records = read(test);
    Assert.assertEquals("Should have only one record", 1, records.size());
    assertEquals("Should match expected record", expected, records.get(0));
    deserialize(records.get(0), Arrays.asList("approx"), Arrays.asList("map<double,struct<x:int,y:int>>"));
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayWritable (org.apache.hadoop.io.ArrayWritable), DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable), RecordConsumer (org.apache.parquet.io.api.RecordConsumer), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
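Spelled out the same way, the schema above is the standard three-level MAP layout, with an optional group carrying the struct value. Again an illustrative sketch, not part of the test:

// Equivalent message string for the map schema built in the test above.
MessageType schema = MessageTypeParser.parseMessageType(
    "message DoubleMapWithStructValue {\n" +
    "  optional group approx (MAP) {\n" +
    "    repeated group key_value {\n" +
    "      optional double key;\n" +
    "      optional group value {\n" +
    "        required int32 x;\n" +
    "        required int32 y;\n" +
    "      }\n" +
    "    }\n" +
    "  }\n" +
    "}");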

Example 9 with RecordConsumer

Use of org.apache.parquet.io.api.RecordConsumer in project hive by apache.

From class TestMapStructures, method testStringMapOptionalPrimitive.

@Test
public void testStringMapOptionalPrimitive() throws Exception {
    Path test = writeDirect("StringMapOptionalPrimitive", Types.buildMessage().optionalGroup().as(MAP).repeatedGroup().required(BINARY).as(UTF8).named("key").optional(INT32).named("value").named("key_value").named("votes").named("StringMapOptionalPrimitive"), new TestArrayCompatibility.DirectWriter() {

        @Override
        public void write(RecordConsumer rc) {
            rc.startMessage();
            rc.startField("votes", 0);
            rc.startGroup();
            rc.startField("key_value", 0);
            rc.startGroup();
            rc.startField("key", 0);
            rc.addBinary(Binary.fromString("lettuce"));
            rc.endField("key", 0);
            rc.startField("value", 1);
            rc.addInteger(34);
            rc.endField("value", 1);
            rc.endGroup();
            rc.startGroup();
            rc.startField("key", 0);
            rc.addBinary(Binary.fromString("kale"));
            rc.endField("key", 0);
            // no value for kale
            rc.endGroup();
            rc.startGroup();
            rc.startField("key", 0);
            rc.addBinary(Binary.fromString("cabbage"));
            rc.endField("key", 0);
            rc.startField("value", 1);
            rc.addInteger(18);
            rc.endField("value", 1);
            rc.endGroup();
            rc.endField("key_value", 0);
            rc.endGroup();
            rc.endField("votes", 0);
            rc.endMessage();
        }
    });
    ArrayWritable expected = list(
        record(new Text("lettuce"), new IntWritable(34)),
        record(new Text("kale"), null),
        record(new Text("cabbage"), new IntWritable(18)));
    List<ArrayWritable> records = read(test);
    Assert.assertEquals("Should have only one record", 1, records.size());
    assertEquals("Should match expected record", expected, records.get(0));
    deserialize(records.get(0), Arrays.asList("votes"), Arrays.asList("map<string,int>"));
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayWritable (org.apache.hadoop.io.ArrayWritable), Text (org.apache.hadoop.io.Text), RecordConsumer (org.apache.parquet.io.api.RecordConsumer), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
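The expected values throughout these tests are assembled with record() and list() helpers that the snippets never define. A plausible minimal sketch, assuming Hive's convention of surfacing structs, lists, and map entries uniformly as ArrayWritable; the real helpers live in the Hive test base class and may add an extra wrapping level:

import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.Writable;

// A struct or map entry is an ArrayWritable of its field values; null marks a missing field.
static ArrayWritable record(Writable... fields) {
    return new ArrayWritable(Writable.class, fields);
}

// A list is likewise an ArrayWritable of its elements.
static ArrayWritable list(Writable... elements) {
    return new ArrayWritable(Writable.class, elements);
}

That convention is why the entry with no value for "kale" comes back as record(new Text("kale"), null) rather than being dropped.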

Example 10 with RecordConsumer

Use of org.apache.parquet.io.api.RecordConsumer in project hive by apache.

From class TestParquetRowGroupFilter, method testRowGroupFilterTakeEffect.

@Test
public void testRowGroupFilterTakeEffect() throws Exception {
    // define schema
    columnNames = "intCol";
    columnTypes = "int";
    StructObjectInspector inspector = getObjectInspector(columnNames, columnTypes);
    MessageType fileSchema = MessageTypeParser.parseMessageType(
        "message hive_schema {\n" +
        "  optional int32 intCol;\n" +
        "}\n");
    conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "intCol");
    conf.set("columns", "intCol");
    conf.set("columns.types", "int");
    // create Parquet file with specific data
    Path testPath = writeDirect("RowGroupFilterTakeEffect", fileSchema, new DirectWriter() {

        @Override
        public void write(RecordConsumer consumer) {
            for (int i = 0; i < 100; i++) {
                consumer.startMessage();
                consumer.startField("int", 0);
                consumer.addInteger(i);
                consumer.endField("int", 0);
                consumer.endMessage();
            }
        }
    });
    // > 50
    GenericUDF udf = new GenericUDFOPGreaterThan();
    List<ExprNodeDesc> children = Lists.newArrayList();
    ExprNodeColumnDesc columnDesc = new ExprNodeColumnDesc(Integer.class, "intCol", "T", false);
    ExprNodeConstantDesc constantDesc = new ExprNodeConstantDesc(50);
    children.add(columnDesc);
    children.add(constantDesc);
    ExprNodeGenericFuncDesc genericFuncDesc = new ExprNodeGenericFuncDesc(inspector, udf, children);
    String searchArgumentStr = SerializationUtilities.serializeExpression(genericFuncDesc);
    conf.set(TableScanDesc.FILTER_EXPR_CONF_STR, searchArgumentStr);
    ParquetRecordReaderWrapper recordReader = (ParquetRecordReaderWrapper)
        new MapredParquetInputFormat().getRecordReader(
            new FileSplit(testPath, 0, fileLength(testPath), (String[]) null), conf, null);
    Assert.assertEquals("row group is not filtered correctly", 1, recordReader.getFiltedBlocks().size());
    // > 100
    constantDesc = new ExprNodeConstantDesc(100);
    children.set(1, constantDesc);
    genericFuncDesc = new ExprNodeGenericFuncDesc(inspector, udf, children);
    searchArgumentStr = SerializationUtilities.serializeExpression(genericFuncDesc);
    conf.set(TableScanDesc.FILTER_EXPR_CONF_STR, searchArgumentStr);
    recordReader = (ParquetRecordReaderWrapper)
        new MapredParquetInputFormat().getRecordReader(
            new FileSplit(testPath, 0, fileLength(testPath), (String[]) null), conf, null);
    Assert.assertEquals("row group is not filtered correctly", 0, recordReader.getFiltedBlocks().size());
}
Also used: Path (org.apache.hadoop.fs.Path), GenericUDFOPGreaterThan (org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan), ExprNodeConstantDesc (org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc), ExprNodeGenericFuncDesc (org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc), ParquetRecordReaderWrapper (org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper), RecordConsumer (org.apache.parquet.io.api.RecordConsumer), FileSplit (org.apache.hadoop.mapred.FileSplit), GenericUDF (org.apache.hadoop.hive.ql.udf.generic.GenericUDF), ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc), ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc), MessageType (org.apache.parquet.schema.MessageType), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), Test (org.junit.Test)
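Hive serializes the filter expression into the job configuration under TableScanDesc.FILTER_EXPR_CONF_STR, and the Parquet reader turns it into a row-group filter evaluated against column min/max statistics. The file written above stores intCol values 0 through 99 in a single row group, so intCol > 50 overlaps the group's range and the group survives, while intCol > 100 cannot match anything and the group is eliminated. (getFiltedBlocks, spelling included, is the method's actual name on ParquetRecordReaderWrapper.) For illustration only, this is the equivalent predicate built directly with Parquet's FilterApi, bypassing Hive's expression translation:

import static org.apache.parquet.filter2.predicate.FilterApi.gt;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;

import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetInputFormat;

// intCol > 50: the row group's stats (min 0, max 99) overlap the predicate, group kept.
FilterPredicate keepsGroup = gt(intColumn("intCol"), 50);

// intCol > 100: no value in [0, 99] can satisfy it, whole group skipped.
FilterPredicate dropsGroup = gt(intColumn("intCol"), 100);

// Handing a predicate straight to Parquet (conf is the job Configuration, assumed in scope):
ParquetInputFormat.setFilterPredicate(conf, dropsGroup);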

Aggregations

Path (org.apache.hadoop.fs.Path): 19
RecordConsumer (org.apache.parquet.io.api.RecordConsumer): 19
Test (org.junit.Test): 19
ArrayWritable (org.apache.hadoop.io.ArrayWritable): 18
IntWritable (org.apache.hadoop.io.IntWritable): 9
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 6
Text (org.apache.hadoop.io.Text): 5
LongWritable (org.apache.hadoop.io.LongWritable): 3
MessageType (org.apache.parquet.schema.MessageType): 2
ParquetRecordReaderWrapper (org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper): 1
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc): 1
ExprNodeConstantDesc (org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc): 1
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 1
ExprNodeGenericFuncDesc (org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc): 1
GenericUDF (org.apache.hadoop.hive.ql.udf.generic.GenericUDF): 1
GenericUDFOPGreaterThan (org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan): 1
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 1
FloatWritable (org.apache.hadoop.io.FloatWritable): 1
FileSplit (org.apache.hadoop.mapred.FileSplit): 1