Examples with VectorizedRowBatch - org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch

Example 6 with VectorizedRowBatch

use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project incubator-gobblin by apache.

the class GenericRecordToOrcValueWriterTest method testUnionRecordConversionWriter.

/**
 * Converts the Avro records of {@code union_test/data.json} into a row batch, writes that batch to a
 * temporary ORC file, reads the file back and checks that each expected union value is present.
 */
@Test
public void testUnionRecordConversionWriter() throws Exception {
    // Derive the ORC schema from the Avro schema shipped with the test resources.
    ClassLoader resourceLoader = getClass().getClassLoader();
    Schema avroSchema = new Schema.Parser().parse(resourceLoader.getResourceAsStream("union_test/schema.avsc"));
    TypeDescription orcSchema = AvroOrcSchemaConverter.getOrcSchema(avroSchema);

    // Convert every Avro record into the same row batch.
    GenericRecordToOrcValueWriter converter = new GenericRecordToOrcValueWriter(orcSchema, avroSchema);
    VectorizedRowBatch batch = orcSchema.createRowBatch();
    List<GenericRecord> avroRecords =
        GobblinOrcWriterTest.deserializeAvroRecords(getClass(), avroSchema, "union_test/data.json");
    for (int i = 0; i < avroRecords.size(); i++) {
        converter.write(avroRecords.get(i), batch);
    }

    // Persist the row batch to an ORC file inside a fresh temporary directory.
    File tempDir = Files.createTempDir();
    File orcFile = new File(tempDir, "orc");
    orcFile.deleteOnExit();
    Path orcPath = new Path(orcFile.getAbsolutePath());
    Writer writer = OrcFile.createWriter(orcPath,
        OrcFile.writerOptions(new Properties(), new Configuration()).setSchema(orcSchema));
    writer.addRowBatch(batch);
    writer.close();

    // Read the file back and verify the number of rows.
    List<Writable> rowsReadBack = deserializeOrcRecords(orcPath, FileSystem.get(new Configuration()));
    Assert.assertEquals(rowsReadBack.size(), 5);

    // Every row is known to be an OrcStruct wrapping a single OrcUnion, so only the union field is compared.
    // A recursive GenericRecord-to-OrcStruct conversion would allow a more thorough comparison,
    // but it is non-trivial and therefore skipped here.
    List<OrcUnion> actualUnions = rowsReadBack.stream()
        .map(row -> getUnionFieldFromStruct(row))
        .collect(Collectors.toList());

    // Build each expected OrcUnion and check that it shows up among the values read back.
    TypeDescription unionSchema = orcSchema.getChildren().get(0);

    OrcUnion memberThree = new OrcUnion(unionSchema);
    memberThree.set((byte) 0, new Text("urn:li:member:3"));
    Assert.assertTrue(actualUnions.contains(memberThree));

    OrcUnion memberFour = new OrcUnion(unionSchema);
    memberFour.set((byte) 0, new Text("urn:li:member:4"));
    Assert.assertTrue(actualUnions.contains(memberFour));

    OrcUnion intTwo = new OrcUnion(unionSchema);
    intTwo.set((byte) 1, new IntWritable(2));
    Assert.assertTrue(actualUnions.contains(intTwo));

    OrcUnion intOne = new OrcUnion(unionSchema);
    intOne.set((byte) 1, new IntWritable(1));
    Assert.assertTrue(actualUnions.contains(intOne));

    OrcUnion intThree = new OrcUnion(unionSchema);
    intThree.set((byte) 1, new IntWritable(3));
    Assert.assertTrue(actualUnions.contains(intThree));
}

Also used: Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Schema(org.apache.avro.Schema) Writable(org.apache.hadoop.io.Writable) IntWritable(org.apache.hadoop.io.IntWritable) Text(org.apache.hadoop.io.Text) Properties(java.util.Properties) VectorizedRowBatch(org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch) OrcFile(org.apache.orc.OrcFile) FileSystem(org.apache.hadoop.fs.FileSystem) TypeDescription(org.apache.orc.TypeDescription) OrcUnion(org.apache.orc.mapred.OrcUnion) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) Writer(org.apache.orc.Writer) Test(org.testng.annotations.Test)

Example 7 with VectorizedRowBatch

use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project incubator-gobblin by apache.

the class GenericRecordToOrcValueWriterTest method testDecimalRecordConversionWriter.

/**
 * Converts the Avro records of {@code decimal_test/data.json} into a row batch, writes that batch to a
 * temporary ORC file, reads the file back and checks the string form of the two resulting rows.
 */
@Test
public void testDecimalRecordConversionWriter() throws Exception {
    // Derive the ORC schema from the Avro schema shipped with the test resources.
    ClassLoader resourceLoader = getClass().getClassLoader();
    Schema avroSchema = new Schema.Parser().parse(resourceLoader.getResourceAsStream("decimal_test/schema.avsc"));
    TypeDescription orcSchema = AvroOrcSchemaConverter.getOrcSchema(avroSchema);

    // Convert every Avro record into the same row batch.
    GenericRecordToOrcValueWriter converter = new GenericRecordToOrcValueWriter(orcSchema, avroSchema);
    VectorizedRowBatch batch = orcSchema.createRowBatch();
    List<GenericRecord> avroRecords =
        GobblinOrcWriterTest.deserializeAvroRecords(getClass(), avroSchema, "decimal_test/data.json");
    for (int i = 0; i < avroRecords.size(); i++) {
        converter.write(avroRecords.get(i), batch);
    }

    // Persist the row batch to an ORC file inside a fresh temporary directory.
    File tempDir = Files.createTempDir();
    File orcFile = new File(tempDir, "orc");
    orcFile.deleteOnExit();
    Path orcPath = new Path(orcFile.getAbsolutePath());
    Writer writer = OrcFile.createWriter(orcPath,
        OrcFile.writerOptions(new Properties(), new Configuration()).setSchema(orcSchema));
    writer.addRowBatch(batch);
    writer.close();

    // Read the file back and compare against the expected decimal values.
    List<Writable> rowsReadBack = deserializeOrcRecords(orcPath, FileSystem.get(new Configuration()));
    Assert.assertEquals(rowsReadBack.size(), 2);
    Writable firstRow = rowsReadBack.get(0);
    Assert.assertEquals(firstRow.toString(), "{3.4}");
    Writable secondRow = rowsReadBack.get(1);
    Assert.assertEquals(secondRow.toString(), "{5.97}");
}

Also used: Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Schema(org.apache.avro.Schema) Writable(org.apache.hadoop.io.Writable) IntWritable(org.apache.hadoop.io.IntWritable) Properties(java.util.Properties) VectorizedRowBatch(org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch) OrcFile(org.apache.orc.OrcFile) FileSystem(org.apache.hadoop.fs.FileSystem) TypeDescription(org.apache.orc.TypeDescription) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) Writer(org.apache.orc.Writer) Test(org.testng.annotations.Test)

Example 8 with VectorizedRowBatch

use of org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch in project incubator-gobblin by apache.

the class GenericRecordToOrcValueWriterTest method testListResize.

/**
 * Writes the six records of {@code list_map_test/data.json} into a deliberately small row batch and
 * checks how many times the value writer reports having resized.
 */
@Test
public void testListResize() throws Exception {
    ClassLoader resourceLoader = getClass().getClassLoader();
    Schema avroSchema = new Schema.Parser().parse(resourceLoader.getResourceAsStream("list_map_test/schema.avsc"));
    TypeDescription orcSchema = AvroOrcSchemaConverter.getOrcSchema(avroSchema);
    GenericRecordToOrcValueWriter converter = new GenericRecordToOrcValueWriter(orcSchema, avroSchema);

    // Keep the batch very small so that the enlarge behavior is triggered easily.
    // It still has to be larger than the number of records deserialized from data.json,
    // because the batch is never reset in this test.
    VectorizedRowBatch smallBatch = orcSchema.createRowBatch(10);

    List<GenericRecord> avroRecords =
        GobblinOrcWriterTest.deserializeAvroRecords(getClass(), avroSchema, "list_map_test/data.json");
    Assert.assertEquals(avroRecords.size(), 6);
    for (int i = 0; i < avroRecords.size(); i++) {
        converter.write(avroRecords.get(i), smallBatch);
    }

    // A resize is expected exactly once for the map and once for the list, hence 2 in total.
    Assert.assertEquals(converter.resizeCount, 2);
}

Also used : VectorizedRowBatch(org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch) Schema(org.apache.avro.Schema) TypeDescription(org.apache.orc.TypeDescription) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.testng.annotations.Test)

Aggregations

VectorizedRowBatch (org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch): 8; TypeDescription (org.apache.orc.TypeDescription): 6; Configuration (org.apache.hadoop.conf.Configuration): 4; OrcFile (org.apache.orc.OrcFile): 4; Properties (java.util.Properties): 3; Schema (org.apache.avro.Schema): 3; GenericRecord (org.apache.avro.generic.GenericRecord): 3; Path (org.apache.hadoop.fs.Path): 3; IntWritable (org.apache.hadoop.io.IntWritable): 3; Writable (org.apache.hadoop.io.Writable): 3; Writer (org.apache.orc.Writer): 3; Test (org.testng.annotations.Test): 3; File (java.io.File): 2; OrcNoHiveShim (org.apache.flink.orc.nohive.shim.OrcNoHiveShim): 2; VectorizedColumnBatch (org.apache.flink.table.data.columnar.vector.VectorizedColumnBatch): 2; LogicalType (org.apache.flink.table.types.logical.LogicalType): 2; FileSystem (org.apache.hadoop.fs.FileSystem): 2; Timestamp (java.sql.Timestamp): 1; ArrayList (java.util.ArrayList): 1; BulkWriter (org.apache.flink.api.common.serialization.BulkWriter): 1