use of org.apache.orc.mapred.OrcUnion in project incubator-gobblin by apache.
the class OrcUtils method structConversionHelper.
/**
 * Recursively copies the content of {@code w} into {@code v}, where {@code v} is shaped by {@code targetSchema}.
 * The schema of {@code w} and {@code v} may differ in a compatible way; struct members are delegated to
 * {@link #upConvertOrcStruct(OrcStruct, OrcStruct, TypeDescription)}, for which this method acts as the helper that
 * handles nested members (per that contract, a field existing in the target but not in the source is filled
 * with null).
 *
 * <p>Dispatch is driven by the runtime type of {@code w}:
 * <ul>
 *   <li>{@code OrcStruct}: delegated to {@code upConvertOrcStruct}.</li>
 *   <li>{@code OrcList}: the target list is cleared and refilled with one converted element per source element.</li>
 *   <li>{@code OrcMap}: the target map is cleared and refilled; keys are reused as-is, values are converted.</li>
 *   <li>{@code OrcUnion}: the source tag is kept and the tagged member is converted.</li>
 *   <li>anything else: handed to {@code handlePrimitiveWritableComparable}.</li>
 * </ul>
 *
 * <p>Unchecked warnings are suppressed because every cast targets an ORC (sub)element type; a mismatch between
 * {@code w} and {@code v} surfaces as a {@link ClassCastException} and fails the process.
 *
 * @param w the source value to copy from
 * @param v the pre-allocated target container, compatible with {@code targetSchema}
 * @param targetSchema the schema describing {@code v}
 * @return {@code v}, after it has been populated from {@code w}
 */
@SuppressWarnings("unchecked")
private static WritableComparable structConversionHelper(WritableComparable w, WritableComparable v, TypeDescription targetSchema) {
  if (w instanceof OrcStruct) {
    upConvertOrcStruct((OrcStruct) w, (OrcStruct) v, targetSchema);
    return v;
  }

  if (w instanceof OrcList) {
    OrcList sourceList = (OrcList) w;
    OrcList destinationList = (OrcList) v;
    TypeDescription elementSchema = targetSchema.getChildren().get(0);
    // Drop whatever placeholder elements the target carried; it is rebuilt from the source below.
    destinationList.clear();
    for (Object sourceElement : sourceList) {
      WritableComparable elementContainer = createValueRecursively(elementSchema, 0);
      WritableComparable convertedElement =
          structConversionHelper((WritableComparable) sourceElement, elementContainer, elementSchema);
      // Appending keeps the source ordering since the target starts out empty.
      destinationList.add(convertedElement);
    }
    return v;
  }

  if (w instanceof OrcMap) {
    OrcMap sourceMap = (OrcMap) w;
    OrcMap destinationMap = (OrcMap) v;
    // Only the value side (child index 1) is converted; keys are carried over untouched.
    TypeDescription mapValueSchema = targetSchema.getChildren().get(1);
    destinationMap.clear();
    for (Object rawEntry : sourceMap.entrySet()) {
      Map.Entry<WritableComparable, WritableComparable> sourceEntry =
          (Map.Entry<WritableComparable, WritableComparable>) rawEntry;
      WritableComparable valueContainer = createValueRecursively(mapValueSchema);
      WritableComparable convertedValue =
          structConversionHelper(sourceEntry.getValue(), valueContainer, mapValueSchema);
      destinationMap.put(sourceEntry.getKey(), convertedValue);
    }
    return v;
  }

  if (w instanceof OrcUnion) {
    OrcUnion sourceUnion = (OrcUnion) w;
    OrcUnion destinationUnion = (OrcUnion) v;
    byte tag = sourceUnion.getTag();
    // ORC doesn't support Union type widening.
    // Avro doesn't allow it either, reference: https://avro.apache.org/docs/current/spec.html#Schema+Resolution
    // Consequently the member schema at a given tag is expected to be identical in source and target.
    TypeDescription memberSchema = targetSchema.getChildren().get(tag);
    WritableComparable sourceMember = (WritableComparable) sourceUnion.getObject();
    WritableComparable memberContainer = (WritableComparable) OrcUtils.createValueRecursively(memberSchema);
    destinationUnion.set(tag, structConversionHelper(sourceMember, memberContainer, memberSchema));
    return v;
  }

  // Primitive leaf: whether or not type-widening is involved, the value of w is copied into v.
  handlePrimitiveWritableComparable(w, v);
  return v;
}
use of org.apache.orc.mapred.OrcUnion in project incubator-gobblin by apache.
the class OrcUtilsTest method testRandomFillOrcStructWithAnySchema.
/**
 * Verifies that {@code OrcTestUtils.fillOrcStructWithFixedValue} populates structs of increasing complexity
 * (flat, nested struct, list of struct, union of struct) with the fixed values it is given:
 * int {@code 3}, string {@code ""} and boolean {@code false}.
 */
@Test
public void testRandomFillOrcStructWithAnySchema() {
  // 1a. Flat struct made only of ints.
  TypeDescription allIntSchema = TypeDescription.fromString("struct<i:int,j:int,k:int>");
  OrcStruct expectedAllInt = (OrcStruct) OrcStruct.createValue(allIntSchema);
  expectedAllInt.setFieldValue("i", new IntWritable(3));
  expectedAllInt.setFieldValue("j", new IntWritable(3));
  expectedAllInt.setFieldValue("k", new IntWritable(3));
  OrcStruct actualAllInt = (OrcStruct) OrcStruct.createValue(allIntSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(actualAllInt, allIntSchema, 3, "", false);
  Assert.assertEquals(actualAllInt, expectedAllInt);

  // 1b. Flat struct mixing boolean, int and string members.
  TypeDescription mixedSchema = TypeDescription.fromString("struct<i:boolean,j:int,k:string>");
  OrcStruct expectedMixed = (OrcStruct) OrcStruct.createValue(mixedSchema);
  expectedMixed.setFieldValue("i", new BooleanWritable(false));
  expectedMixed.setFieldValue("j", new IntWritable(3));
  expectedMixed.setFieldValue("k", new Text(""));
  OrcStruct actualMixed = (OrcStruct) OrcStruct.createValue(mixedSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(actualMixed, mixedSchema, 3, "", false);
  Assert.assertEquals(actualMixed, expectedMixed);

  // 2. Struct nested within a struct; the inner expectation is the mixed struct built above.
  TypeDescription structInStructSchema =
      TypeDescription.fromString("struct<i:boolean,j:struct<i:boolean,j:int,k:string>>");
  OrcStruct expectedStructInStruct = (OrcStruct) OrcStruct.createValue(structInStructSchema);
  expectedStructInStruct.setFieldValue("i", new BooleanWritable(false));
  expectedStructInStruct.setFieldValue("j", expectedMixed);
  OrcStruct actualStructInStruct = (OrcStruct) OrcStruct.createValue(structInStructSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(actualStructInStruct, structInStructSchema, 3, "", false);
  Assert.assertEquals(actualStructInStruct, expectedStructInStruct);

  // 3. Array of struct within a struct.
  TypeDescription listInStructSchema =
      TypeDescription.fromString("struct<i:boolean,j:array<struct<i:boolean,j:int,k:string>>>");
  // Note that createValue does not put any element into the array, so one is added explicitly.
  OrcStruct expectedListInStruct = (OrcStruct) OrcStruct.createValue(listInStructSchema);
  expectedListInStruct.setFieldValue("i", new BooleanWritable(false));
  OrcList expectedElements = new OrcList(mixedSchema, 1);
  expectedElements.add(expectedMixed);
  expectedListInStruct.setFieldValue("j", expectedElements);
  // The struct under test needs a non-empty list; the placeholder element carries no meaningful value yet.
  OrcStruct actualListInStruct = (OrcStruct) OrcStruct.createValue(listInStructSchema);
  OrcList placeholderElements = new OrcList(mixedSchema, 1);
  OrcStruct placeholderElement = (OrcStruct) OrcStruct.createValue(mixedSchema);
  placeholderElements.add(placeholderElement);
  actualListInStruct.setFieldValue("j", placeholderElements);
  OrcTestUtils.fillOrcStructWithFixedValue(actualListInStruct, listInStructSchema, 3, "", false);
  Assert.assertEquals(actualListInStruct, expectedListInStruct);

  // 4. Union of struct within a struct.
  TypeDescription unionInStructSchema =
      TypeDescription.fromString("struct<i:boolean,j:uniontype<struct<i:boolean,j:int,k:string>>>");
  OrcStruct expectedUnionInStruct = (OrcStruct) OrcStruct.createValue(unionInStructSchema);
  expectedUnionInStruct.setFieldValue("i", new BooleanWritable(false));
  OrcUnion expectedUnion = new OrcUnion(mixedSchema);
  expectedUnion.set(0, expectedMixed);
  expectedUnionInStruct.setFieldValue("j", expectedUnion);
  // The struct under test needs a struct placeholder inside the union; the one from case 3 is reused.
  OrcStruct actualUnionInStruct = (OrcStruct) OrcStruct.createValue(unionInStructSchema);
  OrcUnion placeholderUnion = new OrcUnion(mixedSchema);
  placeholderUnion.set(0, placeholderElement);
  actualUnionInStruct.setFieldValue("j", placeholderUnion);
  OrcTestUtils.fillOrcStructWithFixedValue(actualUnionInStruct, unionInStructSchema, 3, "", false);
  Assert.assertEquals(actualUnionInStruct, expectedUnionInStruct);
}
use of org.apache.orc.mapred.OrcUnion in project incubator-gobblin by apache.
the class OrcUtilsTest method testUpConvertOrcStructOfUnion.
/**
 * Verifies {@code OrcUtils.upConvertOrcStruct} on structs containing unions: the member at the populated tag is
 * type-widened (int to bigint), and columns that only exist in the evolved schema end up null.
 */
@Test
public void testUpConvertOrcStructOfUnion() {
  // Source: a union inside a struct; the evolution widens the union's int member.
  TypeDescription sourceSchema = TypeDescription.fromString("struct<a:uniontype<int,string>>");
  OrcStruct sourceStruct = (OrcStruct) OrcStruct.createValue(sourceSchema);
  OrcUnion sourceUnion = new OrcUnion(TypeDescription.fromString("uniontype<int,string>"));
  sourceUnion.set(0, new IntWritable(1));
  sourceStruct.setFieldValue("a", sourceUnion);
  OrcTestUtils.fillOrcStructWithFixedValue(sourceStruct, sourceSchema, intValue1, stringValue1, boolValue);

  // Target: same layout with the first union member evolved from int to bigint.
  TypeDescription widenedSchema = TypeDescription.fromString("struct<a:uniontype<bigint,string>>");
  OrcStruct widenedStruct = (OrcStruct) OrcStruct.createValue(widenedSchema);
  OrcUnion widenedUnion = new OrcUnion(TypeDescription.fromString("uniontype<bigint,string>"));
  widenedUnion.set(0, new LongWritable(1L));
  widenedStruct.setFieldValue("a", widenedUnion);

  OrcUtils.upConvertOrcStruct(sourceStruct, widenedStruct, widenedSchema);

  // At tag 0 (the default chosen by the value-filler) the value must be widened and carry the source value.
  OrcUnion convertedUnion = (OrcUnion) widenedStruct.getFieldValue("a");
  Assert.assertEquals(convertedUnion.getTag(), 0);
  Assert.assertEquals(convertedUnion.getObject(), new LongWritable(intValue1));

  // Check the case when union field is created in different tag.
  // Complex: List<Struct> within struct among others, with evolution in multiple places and type-widening at a
  // deeply nested level.
  TypeDescription nestedSourceSchema =
      TypeDescription.fromString("struct<a:array<struct<a:string,b:int>>,b:struct<a:uniontype<int,string>>>");
  OrcStruct nestedSource = (OrcStruct) OrcUtils.createValueRecursively(nestedSourceSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(nestedSource, nestedSourceSchema, intValue1, stringValue1, boolValue);

  TypeDescription nestedTargetSchema = TypeDescription.fromString(
      "struct<a:array<struct<a:string,b:bigint,c:string>>,b:struct<a:uniontype<bigint,string>,b:int>>");
  OrcStruct nestedTarget = (OrcStruct) OrcUtils.createValueRecursively(nestedTargetSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(nestedTarget, nestedTargetSchema, intValue1, stringValue1, boolValue);

  // New columns must come out as null and type widening must work at every level.
  OrcUtils.upConvertOrcStruct(nestedSource, nestedTarget, nestedTargetSchema);

  OrcStruct firstListElement = (OrcStruct) ((OrcList) nestedTarget.getFieldValue("a")).get(0);
  Assert.assertEquals(firstListElement.getFieldValue("b"), new LongWritable(intValue1));
  Assert.assertNull(firstListElement.getFieldValue("c"));

  OrcStruct innerStruct = (OrcStruct) nestedTarget.getFieldValue("b");
  Assert.assertEquals(((OrcUnion) innerStruct.getFieldValue("a")).getObject(), new LongWritable(intValue1));
  Assert.assertNull(innerStruct.getFieldValue("b"));
}
use of org.apache.orc.mapred.OrcUnion in project incubator-gobblin by apache.
the class GenericRecordToOrcValueWriterTest method testUnionRecordConversionWriter.
/**
 * Round-trips Avro records with a union field through {@code GenericRecordToOrcValueWriter}: the records are
 * written into a row batch, flushed to an ORC file, read back, and the resulting unions are compared against
 * hand-built {@code OrcUnion} expectations.
 */
@Test
public void testUnionRecordConversionWriter() throws Exception {
  Schema avroSchema =
      new Schema.Parser().parse(this.getClass().getClassLoader().getResourceAsStream("union_test/schema.avsc"));
  TypeDescription orcSchema = AvroOrcSchemaConverter.getOrcSchema(avroSchema);
  GenericRecordToOrcValueWriter converter = new GenericRecordToOrcValueWriter(orcSchema, avroSchema);
  VectorizedRowBatch batch = orcSchema.createRowBatch();

  // Convert every Avro record from the fixture into the shared row batch.
  List<GenericRecord> avroRecords =
      GobblinOrcWriterTest.deserializeAvroRecords(this.getClass(), avroSchema, "union_test/data.json");
  for (GenericRecord avroRecord : avroRecords) {
    converter.write(avroRecord, batch);
  }

  // Flush the row batch to a temporary ORC file on disk.
  File orcFile = new File(Files.createTempDir(), "orc");
  orcFile.deleteOnExit();
  Path orcPath = new Path(orcFile.getAbsolutePath());
  OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(new Properties(), new Configuration());
  writerOptions.setSchema(orcSchema);
  Writer writer = OrcFile.createWriter(orcPath, writerOptions);
  writer.addRowBatch(batch);
  writer.close();

  // Read the file back and compare.
  FileSystem fileSystem = FileSystem.get(new Configuration());
  List<Writable> readBack = deserializeOrcRecords(orcPath, fileSystem);
  Assert.assertEquals(readBack.size(), 5);

  // All records are known to be OrcStruct<OrcUnion>, so only the union field is extracted and compared. A full
  // recursive GenericRecord-to-OrcStruct comparison would be more thorough but is non-trivial to build here.
  List<OrcUnion> actualUnions =
      readBack.stream().map(this::getUnionFieldFromStruct).collect(Collectors.toList());

  // Build each expected OrcUnion and verify that it appears among the unions read back.
  TypeDescription unionSchema = orcSchema.getChildren().get(0);

  OrcUnion memberThree = new OrcUnion(unionSchema);
  memberThree.set((byte) 0, new Text("urn:li:member:3"));
  Assert.assertTrue(actualUnions.contains(memberThree));

  OrcUnion memberFour = new OrcUnion(unionSchema);
  memberFour.set((byte) 0, new Text("urn:li:member:4"));
  Assert.assertTrue(actualUnions.contains(memberFour));

  OrcUnion intTwo = new OrcUnion(unionSchema);
  intTwo.set((byte) 1, new IntWritable(2));
  Assert.assertTrue(actualUnions.contains(intTwo));

  OrcUnion intOne = new OrcUnion(unionSchema);
  intOne.set((byte) 1, new IntWritable(1));
  Assert.assertTrue(actualUnions.contains(intOne));

  OrcUnion intThree = new OrcUnion(unionSchema);
  intThree.set((byte) 1, new IntWritable(3));
  Assert.assertTrue(actualUnions.contains(intThree));
}
Aggregations