Search in sources :

Example 6 with OrcUnion

use of org.apache.orc.mapred.OrcUnion in project incubator-gobblin by apache.

the class OrcUtils method structConversionHelper.

/**
 * Recursively copies the content of {@code w} into {@code v} and returns {@code v}.
 *
 * <p>The two values may follow different but compatible schemas; {@code targetSchema} describes the layout of
 * {@code v}. A field that exists in {@code v} but not in {@code w} is expected to end up as null. This is the helper
 * that {@link #upConvertOrcStruct(OrcStruct, OrcStruct, TypeDescription)} relies on when an OrcStruct has nested
 * structures among its members.
 *
 * <p>Unchecked warnings are suppressed on purpose: every cast below targets an ORC (sub)element type, and a
 * mismatch surfaces as a ClassCastException that aborts the conversion.
 *
 * @param w the source value to read from
 * @param v the pre-allocated target value that receives the converted content
 * @param targetSchema the schema that {@code v} conforms to
 * @return {@code v}, after it has been populated from {@code w}
 */
@SuppressWarnings("unchecked")
private static WritableComparable structConversionHelper(WritableComparable w, WritableComparable v, TypeDescription targetSchema) {
    if (w instanceof OrcStruct) {
        // Nested struct: hand over to the struct-level up-conversion.
        upConvertOrcStruct((OrcStruct) w, (OrcStruct) v, targetSchema);
        return v;
    }

    if (w instanceof OrcList) {
        OrcList sourceElements = (OrcList) w;
        OrcList destinationElements = (OrcList) v;
        TypeDescription elementSchema = targetSchema.getChildren().get(0);
        // Whatever the target list held before is discarded; it is rebuilt element by element.
        destinationElements.clear();
        for (int index = 0; index < sourceElements.size(); index++) {
            WritableComparable freshElement = createValueRecursively(elementSchema, 0);
            WritableComparable convertedElement =
                structConversionHelper((WritableComparable) sourceElements.get(index), freshElement, elementSchema);
            destinationElements.add(index, convertedElement);
        }
        return v;
    }

    if (w instanceof OrcMap) {
        OrcMap sourceEntries = (OrcMap) w;
        OrcMap destinationEntries = (OrcMap) v;
        // Only the value side is converted; keys are put into the target map as-is.
        TypeDescription valueType = targetSchema.getChildren().get(1);
        destinationEntries.clear();
        for (Object rawEntry : sourceEntries.entrySet()) {
            Map.Entry<WritableComparable, WritableComparable> sourceEntry =
                (Map.Entry<WritableComparable, WritableComparable>) rawEntry;
            WritableComparable freshValue = createValueRecursively(valueType);
            WritableComparable convertedValue = structConversionHelper(sourceEntry.getValue(), freshValue, valueType);
            destinationEntries.put(sourceEntry.getKey(), convertedValue);
        }
        return v;
    }

    if (w instanceof OrcUnion) {
        OrcUnion sourceUnion = (OrcUnion) w;
        OrcUnion destinationUnion = (OrcUnion) v;
        byte activeTag = sourceUnion.getTag();
        // Union type widening is not supported by ORC, and Avro does not allow it either
        // (see https://avro.apache.org/docs/current/spec.html#Schema+Resolution),
        // so the member schema at a given tag is taken to be identical in source and target.
        TypeDescription memberSchema = targetSchema.getChildren().get(activeTag);
        WritableComparable sourceMember = (WritableComparable) sourceUnion.getObject();
        WritableComparable freshMember = createValueRecursively(memberSchema);
        destinationUnion.set(activeTag, structConversionHelper(sourceMember, freshMember, memberSchema));
        return v;
    }

    // Anything else is treated as a primitive: the value of w is copied into v, with or without type widening.
    handlePrimitiveWritableComparable(w, v);
    return v;
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) OrcList(org.apache.orc.mapred.OrcList) WritableComparable(org.apache.hadoop.io.WritableComparable) TypeDescription(org.apache.orc.TypeDescription) OrcUnion(org.apache.orc.mapred.OrcUnion) HashMap(java.util.HashMap) Map(java.util.Map) OrcMap(org.apache.orc.mapred.OrcMap)

Example 7 with OrcUnion

use of org.apache.orc.mapred.OrcUnion in project incubator-gobblin by apache.

the class OrcUtilsTest method testRandomFillOrcStructWithAnySchema.

/**
 * Checks that {@code OrcTestUtils.fillOrcStructWithFixedValue} populates structs of several shapes (flat, nested
 * struct, array of struct, union of struct) with the fixed values 3, "" and false.
 */
@Test
public void testRandomFillOrcStructWithAnySchema() {
    // Case 1a: flat struct made of ints only.
    TypeDescription allIntSchema = TypeDescription.fromString("struct<i:int,j:int,k:int>");
    OrcStruct allIntExpected = (OrcStruct) OrcStruct.createValue(allIntSchema);
    allIntExpected.setFieldValue("i", new IntWritable(3));
    allIntExpected.setFieldValue("j", new IntWritable(3));
    allIntExpected.setFieldValue("k", new IntWritable(3));
    OrcStruct allIntActual = (OrcStruct) OrcStruct.createValue(allIntSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(allIntActual, allIntSchema, 3, "", false);
    Assert.assertEquals(allIntActual, allIntExpected);

    // Case 1b: flat struct mixing boolean, int and string.
    TypeDescription mixedSchema = TypeDescription.fromString("struct<i:boolean,j:int,k:string>");
    OrcStruct mixedExpected = (OrcStruct) OrcStruct.createValue(mixedSchema);
    mixedExpected.setFieldValue("i", new BooleanWritable(false));
    mixedExpected.setFieldValue("j", new IntWritable(3));
    mixedExpected.setFieldValue("k", new Text(""));
    OrcStruct mixedActual = (OrcStruct) OrcStruct.createValue(mixedSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(mixedActual, mixedSchema, 3, "", false);
    Assert.assertEquals(mixedActual, mixedExpected);

    // Case 2: struct nested inside a struct; the inner expectation is the mixed struct built above.
    TypeDescription structInStructSchema =
        TypeDescription.fromString("struct<i:boolean,j:struct<i:boolean,j:int,k:string>>");
    OrcStruct structInStructExpected = (OrcStruct) OrcStruct.createValue(structInStructSchema);
    structInStructExpected.setFieldValue("i", new BooleanWritable(false));
    structInStructExpected.setFieldValue("j", mixedExpected);
    OrcStruct structInStructActual = (OrcStruct) OrcStruct.createValue(structInStructSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(structInStructActual, structInStructSchema, 3, "", false);
    Assert.assertEquals(structInStructActual, structInStructExpected);

    // Case 3: array of struct inside a struct.
    TypeDescription arrayInStructSchema =
        TypeDescription.fromString("struct<i:boolean,j:array<struct<i:boolean,j:int,k:string>>>");
    // createValue does not put any element into the array, so the expected list is assembled by hand.
    OrcStruct arrayInStructExpected = (OrcStruct) OrcStruct.createValue(arrayInStructSchema);
    arrayInStructExpected.setFieldValue("i", new BooleanWritable(false));
    // NOTE(review): the list is constructed with the element (struct) schema rather than an array schema - confirm intended.
    OrcList expectedElements = new OrcList(mixedSchema, 1);
    expectedElements.add(mixedExpected);
    arrayInStructExpected.setFieldValue("j", expectedElements);
    // The actual struct gets a one-element list up front, presumably so the filler has an element to overwrite;
    // the placeholder struct carries no meaningful value of its own.
    OrcStruct arrayInStructActual = (OrcStruct) OrcStruct.createValue(arrayInStructSchema);
    OrcList seedElements = new OrcList(mixedSchema, 1);
    OrcStruct seedStruct = (OrcStruct) OrcStruct.createValue(mixedSchema);
    seedElements.add(seedStruct);
    arrayInStructActual.setFieldValue("j", seedElements);
    OrcTestUtils.fillOrcStructWithFixedValue(arrayInStructActual, arrayInStructSchema, 3, "", false);
    Assert.assertEquals(arrayInStructActual, arrayInStructExpected);

    // Case 4: union of struct inside a struct.
    TypeDescription unionInStructSchema =
        TypeDescription.fromString("struct<i:boolean,j:uniontype<struct<i:boolean,j:int,k:string>>>");
    OrcStruct unionInStructExpected = (OrcStruct) OrcStruct.createValue(unionInStructSchema);
    unionInStructExpected.setFieldValue("i", new BooleanWritable(false));
    OrcUnion expectedUnion = new OrcUnion(mixedSchema);
    expectedUnion.set(0, mixedExpected);
    unionInStructExpected.setFieldValue("j", expectedUnion);
    // The actual struct needs a struct placeholder inside its union; the seed struct from case 3 is reused here.
    OrcStruct unionInStructActual = (OrcStruct) OrcStruct.createValue(unionInStructSchema);
    OrcUnion seedUnion = new OrcUnion(mixedSchema);
    seedUnion.set(0, seedStruct);
    unionInStructActual.setFieldValue("j", seedUnion);
    OrcTestUtils.fillOrcStructWithFixedValue(unionInStructActual, unionInStructSchema, 3, "", false);
    Assert.assertEquals(unionInStructActual, unionInStructExpected);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) BooleanWritable(org.apache.hadoop.io.BooleanWritable) OrcList(org.apache.orc.mapred.OrcList) TypeDescription(org.apache.orc.TypeDescription) Text(org.apache.hadoop.io.Text) OrcUnion(org.apache.orc.mapred.OrcUnion) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.testng.annotations.Test)

Example 8 with OrcUnion

use of org.apache.orc.mapred.OrcUnion in project incubator-gobblin by apache.

the class OrcUtilsTest method testUpConvertOrcStructOfUnion.

/**
 * Exercises {@code OrcUtils.upConvertOrcStruct} on structs containing unions: first a single union member widened
 * from int to bigint, then a deeper schema where widening and newly added columns occur at several nesting levels.
 */
@Test
public void testUpConvertOrcStructOfUnion() {
    // Source side: a struct whose only field is uniontype<int,string>, seeded at tag 0 and then filled.
    TypeDescription sourceSchema = TypeDescription.fromString("struct<a:uniontype<int,string>>");
    OrcStruct sourceStruct = (OrcStruct) OrcStruct.createValue(sourceSchema);
    OrcUnion sourceUnion = new OrcUnion(TypeDescription.fromString("uniontype<int,string>"));
    sourceUnion.set(0, new IntWritable(1));
    sourceStruct.setFieldValue("a", sourceUnion);
    OrcTestUtils.fillOrcStructWithFixedValue(sourceStruct, sourceSchema, intValue1, stringValue1, boolValue);

    // Target side: same shape, but the first union member is bigint instead of int.
    TypeDescription widenedSchema = TypeDescription.fromString("struct<a:uniontype<bigint,string>>");
    OrcStruct widenedStruct = (OrcStruct) OrcStruct.createValue(widenedSchema);
    OrcUnion widenedUnion = new OrcUnion(TypeDescription.fromString("uniontype<bigint,string>"));
    widenedUnion.set(0, new LongWritable(1L));
    widenedStruct.setFieldValue("a", widenedUnion);

    OrcUtils.upConvertOrcStruct(sourceStruct, widenedStruct, widenedSchema);

    // After conversion the union still sits on tag 0 and holds the source value as a long.
    OrcUnion convertedUnion = (OrcUnion) widenedStruct.getFieldValue("a");
    Assert.assertEquals(convertedUnion.getTag(), 0);
    Assert.assertEquals(convertedUnion.getObject(), new LongWritable(intValue1));

    // Deeper scenario: list of struct plus struct of union, evolving in several places at once.
    TypeDescription nestedSourceSchema =
        TypeDescription.fromString("struct<a:array<struct<a:string,b:int>>,b:struct<a:uniontype<int,string>>>");
    OrcStruct nestedSource = (OrcStruct) OrcUtils.createValueRecursively(nestedSourceSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(nestedSource, nestedSourceSchema, intValue1, stringValue1, boolValue);

    TypeDescription nestedTargetSchema = TypeDescription.fromString(
        "struct<a:array<struct<a:string,b:bigint,c:string>>,b:struct<a:uniontype<bigint,string>,b:int>>");
    OrcStruct nestedTarget = (OrcStruct) OrcUtils.createValueRecursively(nestedTargetSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(nestedTarget, nestedTargetSchema, intValue1, stringValue1, boolValue);

    OrcUtils.upConvertOrcStruct(nestedSource, nestedTarget, nestedTargetSchema);

    // Inside the list element: "b" is widened to long and the added column "c" is null.
    OrcStruct firstListElement = (OrcStruct) ((OrcList) nestedTarget.getFieldValue("a")).get(0);
    Assert.assertEquals(firstListElement.getFieldValue("b"), new LongWritable(intValue1));
    Assert.assertNull(firstListElement.getFieldValue("c"));

    // Inside the nested struct: the union member is widened to long and the added column "b" is null.
    OrcStruct nestedStruct = (OrcStruct) nestedTarget.getFieldValue("b");
    Assert.assertEquals(((OrcUnion) nestedStruct.getFieldValue("a")).getObject(), new LongWritable(intValue1));
    Assert.assertNull(nestedStruct.getFieldValue("b"));
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) OrcList(org.apache.orc.mapred.OrcList) TypeDescription(org.apache.orc.TypeDescription) OrcUnion(org.apache.orc.mapred.OrcUnion) LongWritable(org.apache.hadoop.io.LongWritable) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.testng.annotations.Test)

Example 9 with OrcUnion

use of org.apache.orc.mapred.OrcUnion in project incubator-gobblin by apache.

the class GenericRecordToOrcValueWriterTest method testUnionRecordConversionWriter.

/**
 * Writes the Avro records from {@code union_test/data.json} through {@code GenericRecordToOrcValueWriter} into an
 * ORC file, reads the file back, and checks that each expected union value is present among the records read.
 */
@Test
public void testUnionRecordConversionWriter() throws Exception {
    Schema avroSchema = new Schema.Parser().parse(this.getClass().getClassLoader().getResourceAsStream("union_test/schema.avsc"));
    TypeDescription orcSchema = AvroOrcSchemaConverter.getOrcSchema(avroSchema);
    GenericRecordToOrcValueWriter recordWriter = new GenericRecordToOrcValueWriter(orcSchema, avroSchema);
    VectorizedRowBatch batch = orcSchema.createRowBatch();

    List<GenericRecord> avroRecords =
        GobblinOrcWriterTest.deserializeAvroRecords(this.getClass(), avroSchema, "union_test/data.json");
    for (GenericRecord avroRecord : avroRecords) {
        recordWriter.write(avroRecord, batch);
    }

    // Persist the row batch to a temporary ORC file.
    File orcTempFile = new File(Files.createTempDir(), "orc");
    orcTempFile.deleteOnExit();
    Path orcPath = new Path(orcTempFile.getAbsolutePath());
    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(new Properties(), new Configuration());
    writerOptions.setSchema(orcSchema);
    Writer orcWriter = OrcFile.createWriter(orcPath, writerOptions);
    orcWriter.addRowBatch(batch);
    orcWriter.close();

    // Read the file back and verify its content.
    FileSystem fileSystem = FileSystem.get(new Configuration());
    List<Writable> readBack = deserializeOrcRecords(orcPath, fileSystem);
    Assert.assertEquals(readBack.size(), 5);

    // Every record is known to be an OrcStruct wrapping an OrcUnion, so only the union field is extracted and
    // compared. A full recursive GenericRecord-to-OrcStruct comparison would be more thorough but is non-trivial.
    List<OrcUnion> actualUnions = readBack.stream().map(this::getUnionFieldFromStruct).collect(Collectors.toList());

    // Build each expected union and check that it shows up in the list read from disk.
    TypeDescription unionSchema = orcSchema.getChildren().get(0);

    OrcUnion memberThree = new OrcUnion(unionSchema);
    memberThree.set((byte) 0, new Text("urn:li:member:3"));
    Assert.assertTrue(actualUnions.contains(memberThree));

    OrcUnion memberFour = new OrcUnion(unionSchema);
    memberFour.set((byte) 0, new Text("urn:li:member:4"));
    Assert.assertTrue(actualUnions.contains(memberFour));

    OrcUnion intTwo = new OrcUnion(unionSchema);
    intTwo.set((byte) 1, new IntWritable(2));
    Assert.assertTrue(actualUnions.contains(intTwo));

    OrcUnion intOne = new OrcUnion(unionSchema);
    intOne.set((byte) 1, new IntWritable(1));
    Assert.assertTrue(actualUnions.contains(intOne));

    OrcUnion intThree = new OrcUnion(unionSchema);
    intThree.set((byte) 1, new IntWritable(3));
    Assert.assertTrue(actualUnions.contains(intThree));
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Schema(org.apache.avro.Schema) Writable(org.apache.hadoop.io.Writable) IntWritable(org.apache.hadoop.io.IntWritable) Text(org.apache.hadoop.io.Text) Properties(java.util.Properties) VectorizedRowBatch(org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch) OrcFile(org.apache.orc.OrcFile) FileSystem(org.apache.hadoop.fs.FileSystem) TypeDescription(org.apache.orc.TypeDescription) OrcUnion(org.apache.orc.mapred.OrcUnion) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File) Writer(org.apache.orc.Writer) Test(org.testng.annotations.Test)

Aggregations

OrcUnion (org.apache.orc.mapred.OrcUnion)9 TypeDescription (org.apache.orc.TypeDescription)7 OrcStruct (org.apache.orc.mapred.OrcStruct)7 IntWritable (org.apache.hadoop.io.IntWritable)6 OrcList (org.apache.orc.mapred.OrcList)5 Test (org.testng.annotations.Test)5 Text (org.apache.hadoop.io.Text)3 Map (java.util.Map)2 Configuration (org.apache.hadoop.conf.Configuration)2 BooleanWritable (org.apache.hadoop.io.BooleanWritable)2 LongWritable (org.apache.hadoop.io.LongWritable)2 WritableComparable (org.apache.hadoop.io.WritableComparable)2 OrcMap (org.apache.orc.mapred.OrcMap)2 File (java.io.File)1 IOException (java.io.IOException)1 Field (java.lang.reflect.Field)1 HashMap (java.util.HashMap)1 Properties (java.util.Properties)1 Schema (org.apache.avro.Schema)1 GenericRecord (org.apache.avro.generic.GenericRecord)1