Search in sources :

Example 36 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcUtilsTest method testRandomFillOrcStructWithAnySchema.

/**
 * Checks that {@code OrcTestUtils.fillOrcStructWithFixedValue} yields structs equal to hand-built
 * expectations for: flat schemas, a struct nested in a struct, an array of structs inside a struct,
 * and a union of a struct inside a struct. Every fill uses the fixed values 3, "" and false.
 */
@Test
public void testRandomFillOrcStructWithAnySchema() {
    // Case 1a: flat struct consisting only of int fields.
    final TypeDescription intOnlySchema = TypeDescription.fromString("struct<i:int,j:int,k:int>");
    final OrcStruct expectedIntOnly = (OrcStruct) OrcStruct.createValue(intOnlySchema);
    for (String fieldName : new String[] {"i", "j", "k"}) {
        expectedIntOnly.setFieldValue(fieldName, new IntWritable(3));
    }
    final OrcStruct filledIntOnly = (OrcStruct) OrcStruct.createValue(intOnlySchema);
    OrcTestUtils.fillOrcStructWithFixedValue(filledIntOnly, intOnlySchema, 3, "", false);
    Assert.assertEquals(filledIntOnly, expectedIntOnly);

    // Case 1b: flat struct mixing boolean, int and string fields.
    final TypeDescription mixedSchema = TypeDescription.fromString("struct<i:boolean,j:int,k:string>");
    final OrcStruct expectedMixed = (OrcStruct) OrcStruct.createValue(mixedSchema);
    expectedMixed.setFieldValue("i", new BooleanWritable(false));
    expectedMixed.setFieldValue("j", new IntWritable(3));
    expectedMixed.setFieldValue("k", new Text(""));
    final OrcStruct filledMixed = (OrcStruct) OrcStruct.createValue(mixedSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(filledMixed, mixedSchema, 3, "", false);
    Assert.assertEquals(filledMixed, expectedMixed);

    // Case 2: struct nested inside a struct; the inner expectation is the mixed struct built above.
    final TypeDescription structInStructSchema =
        TypeDescription.fromString("struct<i:boolean,j:struct<i:boolean,j:int,k:string>>");
    final OrcStruct expectedStructInStruct = (OrcStruct) OrcStruct.createValue(structInStructSchema);
    expectedStructInStruct.setFieldValue("i", new BooleanWritable(false));
    expectedStructInStruct.setFieldValue("j", expectedMixed);
    final OrcStruct filledStructInStruct = (OrcStruct) OrcStruct.createValue(structInStructSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(filledStructInStruct, structInStructSchema, 3, "", false);
    Assert.assertEquals(filledStructInStruct, expectedStructInStruct);

    // Case 3: array of structs inside a struct.
    final TypeDescription arrayInStructSchema =
        TypeDescription.fromString("struct<i:boolean,j:array<struct<i:boolean,j:int,k:string>>>");
    // Per the original note, createValue does not put any element into the array field,
    // so the expected single-element list is assembled by hand.
    final OrcStruct expectedArrayInStruct = (OrcStruct) OrcStruct.createValue(arrayInStructSchema);
    expectedArrayInStruct.setFieldValue("i", new BooleanWritable(false));
    final OrcList expectedElements = new OrcList(mixedSchema, 1);
    expectedElements.add(expectedMixed);
    expectedArrayInStruct.setFieldValue("j", expectedElements);
    // The struct under test gets a non-empty list up front; its single element is a placeholder
    // struct that carries no meaningful value before the fill.
    final OrcStruct filledArrayInStruct = (OrcStruct) OrcStruct.createValue(arrayInStructSchema);
    final OrcList seededElements = new OrcList(mixedSchema, 1);
    final OrcStruct seedStruct = (OrcStruct) OrcStruct.createValue(mixedSchema);
    seededElements.add(seedStruct);
    filledArrayInStruct.setFieldValue("j", seededElements);
    OrcTestUtils.fillOrcStructWithFixedValue(filledArrayInStruct, arrayInStructSchema, 3, "", false);
    Assert.assertEquals(filledArrayInStruct, expectedArrayInStruct);

    // Case 4: union of a struct inside a struct.
    final TypeDescription unionInStructSchema =
        TypeDescription.fromString("struct<i:boolean,j:uniontype<struct<i:boolean,j:int,k:string>>>");
    final OrcStruct expectedUnionInStruct = (OrcStruct) OrcStruct.createValue(unionInStructSchema);
    expectedUnionInStruct.setFieldValue("i", new BooleanWritable(false));
    final OrcUnion expectedUnion = new OrcUnion(mixedSchema);
    expectedUnion.set(0, expectedMixed);
    expectedUnionInStruct.setFieldValue("j", expectedUnion);
    // The struct under test gets a union at tag 0 that wraps the same placeholder struct used in case 3.
    final OrcStruct filledUnionInStruct = (OrcStruct) OrcStruct.createValue(unionInStructSchema);
    final OrcUnion seededUnion = new OrcUnion(mixedSchema);
    seededUnion.set(0, seedStruct);
    filledUnionInStruct.setFieldValue("j", seededUnion);
    OrcTestUtils.fillOrcStructWithFixedValue(filledUnionInStruct, unionInStructSchema, 3, "", false);
    Assert.assertEquals(filledUnionInStruct, expectedUnionInStruct);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) BooleanWritable(org.apache.hadoop.io.BooleanWritable) OrcList(org.apache.orc.mapred.OrcList) TypeDescription(org.apache.orc.TypeDescription) Text(org.apache.hadoop.io.Text) OrcUnion(org.apache.orc.mapred.OrcUnion) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.testng.annotations.Test)

Example 37 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcUtilsTest method testUpConvertOrcStructOfUnion.

/**
 * Exercises {@code OrcUtils.upConvertOrcStruct} on structs that contain unions: first an int member
 * widened to bigint inside a union, then a larger schema combining a list of structs and a nested
 * struct with a union, where fields are both widened and newly added.
 */
@Test
public void testUpConvertOrcStructOfUnion() {
    // Source: struct holding a uniontype<int,string>, seeded at tag 0 and then filled with fixed values.
    final TypeDescription sourceSchema = TypeDescription.fromString("struct<a:uniontype<int,string>>");
    final OrcStruct sourceStruct = (OrcStruct) OrcStruct.createValue(sourceSchema);
    final OrcUnion sourceUnion = new OrcUnion(TypeDescription.fromString("uniontype<int,string>"));
    sourceUnion.set(0, new IntWritable(1));
    sourceStruct.setFieldValue("a", sourceUnion);
    OrcTestUtils.fillOrcStructWithFixedValue(sourceStruct, sourceSchema, intValue1, stringValue1, boolValue);

    // Target: same shape, but the union's first member is bigint instead of int.
    final TypeDescription widenedSchema = TypeDescription.fromString("struct<a:uniontype<bigint,string>>");
    final OrcStruct widenedStruct = (OrcStruct) OrcStruct.createValue(widenedSchema);
    final OrcUnion widenedSeedUnion = new OrcUnion(TypeDescription.fromString("uniontype<bigint,string>"));
    widenedSeedUnion.set(0, new LongWritable(1L));
    widenedStruct.setFieldValue("a", widenedSeedUnion);
    OrcUtils.upConvertOrcStruct(sourceStruct, widenedStruct, widenedSchema);

    // After conversion the union should sit at tag 0 and carry the source value as a LongWritable.
    final OrcUnion convertedUnion = (OrcUnion) widenedStruct.getFieldValue("a");
    Assert.assertEquals(convertedUnion.getTag(), 0);
    final OrcUnion convertedUnionAgain = (OrcUnion) widenedStruct.getFieldValue("a");
    Assert.assertEquals(convertedUnionAgain.getObject(), new LongWritable(intValue1));

    // Complex case: a list of structs plus a nested struct with a union; the evolved schema widens
    // int to bigint in two places and adds the fields a[].c and b.b.
    final TypeDescription nestedSourceSchema =
        TypeDescription.fromString("struct<a:array<struct<a:string,b:int>>,b:struct<a:uniontype<int,string>>>");
    final OrcStruct nestedSource = (OrcStruct) OrcUtils.createValueRecursively(nestedSourceSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(nestedSource, nestedSourceSchema, intValue1, stringValue1, boolValue);
    final TypeDescription nestedEvolvedSchema = TypeDescription.fromString(
        "struct<a:array<struct<a:string,b:bigint,c:string>>,b:struct<a:uniontype<bigint,string>,b:int>>");
    final OrcStruct nestedEvolved = (OrcStruct) OrcUtils.createValueRecursively(nestedEvolvedSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(nestedEvolved, nestedEvolvedSchema, intValue1, stringValue1, boolValue);

    // Newly added columns must come out null; widened columns must hold the source value as a long.
    OrcUtils.upConvertOrcStruct(nestedSource, nestedEvolved, nestedEvolvedSchema);
    final OrcStruct firstElement = (OrcStruct) ((OrcList) nestedEvolved.getFieldValue("a")).get(0);
    Assert.assertEquals(firstElement.getFieldValue("b"), new LongWritable(intValue1));
    final OrcStruct firstElementAgain = (OrcStruct) ((OrcList) nestedEvolved.getFieldValue("a")).get(0);
    Assert.assertNull(firstElementAgain.getFieldValue("c"));
    final OrcStruct innerStruct = (OrcStruct) nestedEvolved.getFieldValue("b");
    Assert.assertEquals(((OrcUnion) innerStruct.getFieldValue("a")).getObject(), new LongWritable(intValue1));
    final OrcStruct innerStructAgain = (OrcStruct) nestedEvolved.getFieldValue("b");
    Assert.assertNull(innerStructAgain.getFieldValue("b"));
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) OrcList(org.apache.orc.mapred.OrcList) TypeDescription(org.apache.orc.TypeDescription) OrcUnion(org.apache.orc.mapred.OrcUnion) LongWritable(org.apache.hadoop.io.LongWritable) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.testng.annotations.Test)

Example 38 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcUtilsTest method testOrcStructProjection.

/**
 * Sanity check for column projection: converting a struct into a narrower reader schema through
 * {@code OrcUtils.upConvertOrcStruct} should give the same result as filling a struct of that
 * narrower schema directly with the same fixed values.
 */
@Test
public void testOrcStructProjection() {
    // Full-width source record, populated with the fixed test values.
    final TypeDescription fullSchema =
        TypeDescription.fromString("struct<a:struct<a:int,b:int>,b:struct<c:int,d:int>,c:int>");
    final OrcStruct fullRecord = (OrcStruct) OrcUtils.createValueRecursively(fullSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(fullRecord, fullSchema, intValue1, stringValue1, boolValue);

    // Reader schema keeps only a.b and b.c; the expectation is that schema filled with the same values.
    final TypeDescription readerSchema = TypeDescription.fromString("struct<a:struct<b:int>,b:struct<c:int>>");
    final OrcStruct expectedProjection = (OrcStruct) OrcUtils.createValueRecursively(readerSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(expectedProjection, readerSchema, intValue1, stringValue1, boolValue);

    // Project the full record into a fresh struct of the reader schema and compare.
    final OrcStruct actualProjection = (OrcStruct) OrcUtils.createValueRecursively(readerSchema);
    OrcUtils.upConvertOrcStruct(fullRecord, actualProjection, readerSchema);
    Assert.assertEquals(actualProjection, expectedProjection);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) TypeDescription(org.apache.orc.TypeDescription) Test(org.testng.annotations.Test)

Example 39 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcUtilsTest method testUpConvertOrcStructOfList.

/**
 * Exercises {@code OrcUtils.upConvertOrcStruct} on a struct holding a list of structs whose element
 * type gains a new field, and checks that the target list ends up with the source list's size when
 * the two differ (two elements, then three, then zero).
 */
@Test
public void testUpConvertOrcStructOfList() {
    // Source schema: a list of struct<a:int,b:string> wrapped in a struct.
    final TypeDescription sourceSchema = TypeDescription.fromString("struct<a:array<struct<a:int,b:string>>>");
    final OrcStruct twoElementSource = (OrcStruct) OrcUtils.createValueRecursively(sourceSchema);

    // Hand-build a two-entry list for the source struct.
    final TypeDescription elementSchema = TypeDescription.createStruct()
        .addField("a", TypeDescription.createInt())
        .addField("b", TypeDescription.createString());
    final OrcStruct firstEntry = new OrcStruct(elementSchema);
    firstEntry.setFieldValue("a", new IntWritable(intValue1));
    firstEntry.setFieldValue("b", new Text(stringValue1));
    final OrcStruct secondEntry = new OrcStruct(elementSchema);
    secondEntry.setFieldValue("a", new IntWritable(intValue2));
    secondEntry.setFieldValue("b", new Text(stringValue2));
    final OrcList sourceEntries = new OrcList(TypeDescription.createList(elementSchema));
    sourceEntries.add(firstEntry);
    sourceEntries.add(secondEntry);
    twoElementSource.setFieldValue("a", sourceEntries);

    // Target schema: the element struct gains an extra int field "c".
    final TypeDescription evolvedSchema =
        TypeDescription.fromString("struct<a:array<struct<a:int,b:string,c:int>>>");
    final OrcStruct evolvedStruct = (OrcStruct) OrcUtils.createValueRecursively(evolvedSchema);

    // Convert, then verify each element: existing fields carried over, new field null.
    OrcUtils.upConvertOrcStruct(twoElementSource, evolvedStruct, evolvedSchema);
    OrcStruct converted = (OrcStruct) ((OrcList) evolvedStruct.getFieldValue("a")).get(0);
    Assert.assertEquals(((IntWritable) converted.getFieldValue("a")).get(), intValue1);
    converted = (OrcStruct) ((OrcList) evolvedStruct.getFieldValue("a")).get(0);
    Assert.assertEquals(converted.getFieldValue("b").toString(), stringValue1);
    converted = (OrcStruct) ((OrcList) evolvedStruct.getFieldValue("a")).get(0);
    Assert.assertNull(converted.getFieldValue("c"));
    converted = (OrcStruct) ((OrcList) evolvedStruct.getFieldValue("a")).get(1);
    Assert.assertEquals(((IntWritable) converted.getFieldValue("a")).get(), intValue2);
    converted = (OrcStruct) ((OrcList) evolvedStruct.getFieldValue("a")).get(1);
    Assert.assertEquals(converted.getFieldValue("b").toString(), stringValue2);
    converted = (OrcStruct) ((OrcList) evolvedStruct.getFieldValue("a")).get(1);
    Assert.assertNull(converted.getFieldValue("c"));

    // New source built via createValueRecursively(schema, 3) and filled with fixed values; its list
    // size must differ from the target's before conversion, and the target must hold 3 afterwards.
    final OrcStruct threeElementSource = (OrcStruct) OrcUtils.createValueRecursively(sourceSchema, 3);
    OrcTestUtils.fillOrcStructWithFixedValue(threeElementSource, sourceSchema, intValue1, stringValue1, boolValue);
    Assert.assertNotEquals(
        ((OrcList) threeElementSource.getFieldValue("a")).size(),
        ((OrcList) evolvedStruct.getFieldValue("a")).size());
    OrcUtils.upConvertOrcStruct(threeElementSource, evolvedStruct, evolvedSchema);
    Assert.assertEquals(((OrcList) evolvedStruct.getFieldValue("a")).size(), 3);

    // Empty the source list while the target still has 3 elements; conversion should empty the target.
    ((OrcList) threeElementSource.getFieldValue("a")).clear();
    OrcUtils.upConvertOrcStruct(threeElementSource, evolvedStruct, evolvedSchema);
    Assert.assertEquals(((OrcList) evolvedStruct.getFieldValue("a")).size(), 0);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) OrcList(org.apache.orc.mapred.OrcList) TypeDescription(org.apache.orc.TypeDescription) Text(org.apache.hadoop.io.Text) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.testng.annotations.Test)

Aggregations

OrcStruct (org.apache.orc.mapred.OrcStruct): 39, TypeDescription (org.apache.orc.TypeDescription): 24, Configuration (org.apache.hadoop.conf.Configuration): 18, IntWritable (org.apache.hadoop.io.IntWritable): 17, Test (org.testng.annotations.Test): 15, ArrayList (java.util.ArrayList): 13, Test (org.junit.Test): 9, OrcFile (org.apache.orc.OrcFile): 8, OrcList (org.apache.orc.mapred.OrcList): 8, File (java.io.File): 7, InputRow (org.apache.druid.data.input.InputRow): 7, Path (org.apache.hadoop.fs.Path): 7, Job (org.apache.hadoop.mapreduce.Job): 7, OrcUnion (org.apache.orc.mapred.OrcUnion): 7, ImmutableList (com.google.common.collect.ImmutableList): 6, List (java.util.List): 6, HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig): 6, Text (org.apache.hadoop.io.Text): 6, OrcKey (org.apache.orc.mapred.OrcKey): 5, OrcMap (org.apache.orc.mapred.OrcMap): 5