Search in sources :

Example 16 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcKeyComparator method setConf.

@Override
public void setConf(Configuration conf) {
    super.setConf(conf);
    if (conf == null) {
        return;
    }
    // The MapReduce framework sorts map-output OrcKey objects with this comparator,
    // so read the shuffle key schema here; the non-raw compare() path needs it to
    // deserialize bytes into typed records.
    schema = TypeDescription.fromString(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()));
    // Lazily allocate the reusable comparison scratch objects the first time we are configured.
    if (key1 == null) {
        key1 = new OrcKey();
    }
    if (key2 == null) {
        key2 = new OrcKey();
    }
    if (buffer == null) {
        buffer = new DataInputBuffer();
    }
    // Back each key with a fresh record container shaped by the configured schema.
    key1.key = (OrcStruct) OrcStruct.createValue(schema);
    key2.key = (OrcStruct) OrcStruct.createValue(schema);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) DataInputBuffer(org.apache.hadoop.io.DataInputBuffer) OrcKey(org.apache.orc.mapred.OrcKey)

Example 17 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcTestUtils method fillOrcStructWithFixedValue.

/**
 * Fill in values in {@param w} according to the given schema, assuming {@param w} was created from
 * the same {@param schema}. The explicit schema argument is still required because the concrete
 * value type is only available from {@link TypeDescription} and not from
 * {@link org.apache.orc.mapred.OrcValue}.
 *
 * For simplicity here are some assumptions:
 * - Only three primitive seed values (int, String, boolean) are used to construct compound values.
 * To make this work across types that widen or shrink into each other, keep the values within a
 * small range.
 * - For List, Map or Union, the record-container must already hold at least one entry;
 * you may want to try createValueRecursively(TypeDescription) instead of
 * {@link OrcStruct#createValue(TypeDescription)}
 */
public static void fillOrcStructWithFixedValue(WritableComparable w, TypeDescription schema, int unionTag, int intValue, String stringValue, boolean booleanValue) {
    switch(schema.getCategory()) {
        case BOOLEAN:
            ((BooleanWritable) w).set(booleanValue);
            break;
        case BYTE:
            ((ByteWritable) w).set((byte) intValue);
            break;
        case SHORT:
            ((ShortWritable) w).set((short) intValue);
            break;
        case INT:
            ((IntWritable) w).set(intValue);
            break;
        case LONG:
            // Widened from the int seed value.
            ((LongWritable) w).set(intValue);
            break;
        case FLOAT:
            ((FloatWritable) w).set(intValue * 1.0f);
            break;
        case DOUBLE:
            ((DoubleWritable) w).set(intValue * 1.0);
            break;
        case STRING:
        case CHAR:
        case VARCHAR:
            // All character-like categories share the Text representation in orc-mapred.
            ((Text) w).set(stringValue);
            break;
        case BINARY:
            throw new UnsupportedOperationException("Binary type is not supported in random orc data filler");
        case DECIMAL:
            throw new UnsupportedOperationException("Decimal type is not supported in random orc data filler");
        case DATE:
        case TIMESTAMP:
        case TIMESTAMP_INSTANT:
            throw new UnsupportedOperationException("Timestamp and its derived types is not supported in random orc data filler");
        case LIST: {
            // The list must already contain at least one element, otherwise nothing is traversed or filled.
            OrcList list = (OrcList) w;
            TypeDescription elementSchema = schema.getChildren().get(0);
            for (Object element : list) {
                fillOrcStructWithFixedValue((WritableComparable) element, elementSchema, unionTag, intValue, stringValue, booleanValue);
            }
            break;
        }
        case MAP: {
            OrcMap map = (OrcMap) w;
            TypeDescription keySchema = schema.getChildren().get(0);
            TypeDescription valueSchema = schema.getChildren().get(1);
            // Recurse into every existing entry's key and value.
            for (Object rawEntry : map.entrySet()) {
                Map.Entry<WritableComparable, WritableComparable> entry = (Map.Entry<WritableComparable, WritableComparable>) rawEntry;
                fillOrcStructWithFixedValue(entry.getKey(), keySchema, unionTag, intValue, stringValue, booleanValue);
                fillOrcStructWithFixedValue(entry.getValue(), valueSchema, unionTag, intValue, stringValue, booleanValue);
            }
            break;
        }
        case STRUCT: {
            // Fill each field positionally against the matching child schema.
            OrcStruct struct = (OrcStruct) w;
            for (int i = 0; i < schema.getChildren().size(); i++) {
                fillOrcStructWithFixedValue(struct.getFieldValue(i), schema.getChildren().get(i), unionTag, intValue, stringValue, booleanValue);
            }
            break;
        }
        case UNION: {
            OrcUnion union = (OrcUnion) w;
            TypeDescription memberSchema = schema.getChildren().get(unionTag);
            // Re-create the selected member so the union carries the requested tag before filling it.
            union.set(unionTag, OrcUtils.createValueRecursively(memberSchema));
            fillOrcStructWithFixedValue((WritableComparable) union.getObject(), memberSchema, unionTag, intValue, stringValue, booleanValue);
            break;
        }
        default:
            throw new IllegalArgumentException("Unknown type " + schema.toString());
    }
}
Also used : DoubleWritable(org.apache.hadoop.io.DoubleWritable) Text(org.apache.hadoop.io.Text) ShortWritable(org.apache.hadoop.io.ShortWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) OrcStruct(org.apache.orc.mapred.OrcStruct) BooleanWritable(org.apache.hadoop.io.BooleanWritable) OrcList(org.apache.orc.mapred.OrcList) WritableComparable(org.apache.hadoop.io.WritableComparable) TypeDescription(org.apache.orc.TypeDescription) LongWritable(org.apache.hadoop.io.LongWritable) OrcUnion(org.apache.orc.mapred.OrcUnion) ByteWritable(org.apache.hadoop.io.ByteWritable) Map(java.util.Map) OrcMap(org.apache.orc.mapred.OrcMap) IntWritable(org.apache.hadoop.io.IntWritable) OrcMap(org.apache.orc.mapred.OrcMap)

Example 18 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcUtilsTest method testNestedWithinUnionWithDiffTag.

@Test
public void testNestedWithinUnionWithDiffTag() {
    // Source and destination pick different union tags; verify up-convert propagates either member.
    TypeDescription schema = TypeDescription.fromString("struct<a:uniontype<struct<a:int,b:string>,int>>");
    // Tag 0: the union holds the nested struct member.
    OrcStruct recordWithStructMember = (OrcStruct) OrcUtils.createValueRecursively(schema);
    OrcTestUtils.fillOrcStructWithFixedValue(recordWithStructMember, schema, 0, intValue1, stringValue1, boolValue);
    OrcUnion unionWithStruct = (OrcUnion) recordWithStructMember.getFieldValue("a");
    Assert.assertEquals(((OrcStruct) unionWithStruct.getObject()).getFieldValue("a"), new IntWritable(intValue1));
    // Tag 1: the union holds the plain int member.
    OrcStruct recordWithIntMember = (OrcStruct) OrcUtils.createValueRecursively(schema);
    OrcTestUtils.fillOrcStructWithFixedValue(recordWithIntMember, schema, 1, intValue1, stringValue1, boolValue);
    Assert.assertEquals(((OrcUnion) recordWithIntMember.getFieldValue("a")).getObject(), new IntWritable(intValue1));
    // Up-convert both records into the same container in turn; each conversion must fully propagate.
    OrcStruct container = (OrcStruct) OrcUtils.createValueRecursively(schema);
    OrcUtils.upConvertOrcStruct(recordWithStructMember, container, schema);
    Assert.assertEquals(recordWithStructMember, container);
    OrcUtils.upConvertOrcStruct(recordWithIntMember, container, schema);
    Assert.assertEquals(recordWithIntMember, container);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) TypeDescription(org.apache.orc.TypeDescription) OrcUnion(org.apache.orc.mapred.OrcUnion) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.testng.annotations.Test)

Example 19 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcUtilsTest method testUpConvertSimpleOrcStruct.

@Test
public void testUpConvertSimpleOrcStruct() {
    // All-primitive base case: a column added by schema evolution must come back as null.
    TypeDescription baseSchema = TypeDescription.fromString("struct<a:int,b:string>");
    OrcStruct base = (OrcStruct) OrcStruct.createValue(baseSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(base, baseSchema, intValue1, stringValue1, boolValue);
    // Evolve the schema by appending column c.
    TypeDescription evolvedSchema = TypeDescription.fromString("struct<a:int,b:string,c:int>");
    OrcStruct evolved = (OrcStruct) OrcStruct.createValue(evolvedSchema);
    // Equivalent to deserialize(base).serialize(evolved, evolvedSchema).
    OrcUtils.upConvertOrcStruct(base, evolved, evolvedSchema);
    // Existing columns carry over; the newly added column is null.
    Assert.assertEquals(((IntWritable) evolved.getFieldValue("a")).get(), intValue1);
    Assert.assertEquals(evolved.getFieldValue("b").toString(), stringValue1);
    Assert.assertNull(evolved.getFieldValue("c"));
    // Reverse direction acts as a projection onto the top-level base columns.
    OrcStruct projected = (OrcStruct) OrcStruct.createValue(baseSchema);
    OrcUtils.upConvertOrcStruct(evolved, projected, baseSchema);
    Assert.assertEquals(base, projected);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) TypeDescription(org.apache.orc.TypeDescription) Test(org.testng.annotations.Test)

Example 20 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcUtilsTest method testUpConvertOrcStructOfMap.

@Test
public void testUpConvertOrcStructOfMap() {
    // Struct wrapping a map whose value type widens from int to bigint on evolution.
    TypeDescription sourceSchema = TypeDescription.fromString("struct<a:map<string,int>>");
    OrcStruct source = (OrcStruct) OrcStruct.createValue(sourceSchema);
    OrcMap sourceMap = new OrcMap(TypeDescription.createMap(TypeDescription.createString(), TypeDescription.createInt()));
    // Seed two entries so the map conversion has content to traverse.
    sourceMap.put(new Text(stringValue1), new IntWritable(intValue1));
    sourceMap.put(new Text(stringValue2), new IntWritable(intValue2));
    source.setFieldValue("a", sourceMap);
    // Target struct under the evolved schema, initialized with a single placeholder entry.
    TypeDescription evolvedSchema = TypeDescription.fromString("struct<a:map<string,bigint>>");
    OrcStruct target = (OrcStruct) OrcStruct.createValue(evolvedSchema);
    OrcMap targetMap = new OrcMap(TypeDescription.createMap(TypeDescription.createString(), TypeDescription.createInt()));
    targetMap.put(new Text(""), new LongWritable());
    target.setFieldValue("a", targetMap);
    // Convert and verify: values widened to long, and the entry count matches the source.
    OrcUtils.upConvertOrcStruct(source, target, evolvedSchema);
    OrcMap converted = (OrcMap) target.getFieldValue("a");
    Assert.assertEquals(converted.get(new Text(stringValue1)), new LongWritable(intValue1));
    Assert.assertEquals(converted.get(new Text(stringValue2)), new LongWritable(intValue2));
    Assert.assertEquals(converted.size(), 2);
    // Re-use the same target object after the source map gains a third entry.
    sourceMap.put(new Text(stringValue3), new IntWritable(intValue3));
    // sanity check
    Assert.assertEquals(((OrcMap) source.getFieldValue("a")).size(), 3);
    OrcUtils.upConvertOrcStruct(source, target, evolvedSchema);
    OrcMap reconverted = (OrcMap) target.getFieldValue("a");
    Assert.assertEquals(reconverted.size(), 3);
    Assert.assertEquals(reconverted.get(new Text(stringValue1)), new LongWritable(intValue1));
    Assert.assertEquals(reconverted.get(new Text(stringValue2)), new LongWritable(intValue2));
    Assert.assertEquals(reconverted.get(new Text(stringValue3)), new LongWritable(intValue3));
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) TypeDescription(org.apache.orc.TypeDescription) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) IntWritable(org.apache.hadoop.io.IntWritable) OrcMap(org.apache.orc.mapred.OrcMap) Test(org.testng.annotations.Test)

Aggregations

OrcStruct (org.apache.orc.mapred.OrcStruct)39 TypeDescription (org.apache.orc.TypeDescription)24 Configuration (org.apache.hadoop.conf.Configuration)18 IntWritable (org.apache.hadoop.io.IntWritable)17 Test (org.testng.annotations.Test)15 ArrayList (java.util.ArrayList)13 Test (org.junit.Test)9 OrcFile (org.apache.orc.OrcFile)8 OrcList (org.apache.orc.mapred.OrcList)8 File (java.io.File)7 InputRow (org.apache.druid.data.input.InputRow)7 Path (org.apache.hadoop.fs.Path)7 Job (org.apache.hadoop.mapreduce.Job)7 OrcUnion (org.apache.orc.mapred.OrcUnion)7 ImmutableList (com.google.common.collect.ImmutableList)6 List (java.util.List)6 HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig)6 Text (org.apache.hadoop.io.Text)6 OrcKey (org.apache.orc.mapred.OrcKey)5 OrcMap (org.apache.orc.mapred.OrcMap)5