
Example 31 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

The class OrcCompactionTaskTest, method readOrcFile.

/**
 * Read a compacted ORC output file into memory.
 * This only works if the fields are int values.
 */
private List<OrcStruct> readOrcFile(Path orcFilePath) throws IOException, InterruptedException {
    ReaderImpl orcReader = new ReaderImpl(orcFilePath, new OrcFile.ReaderOptions(new Configuration()));
    Reader.Options options = new Reader.Options().schema(orcReader.getSchema());
    OrcMapreduceRecordReader recordReader = new OrcMapreduceRecordReader(orcReader, options);
    List<OrcStruct> result = new ArrayList<>();
    OrcStruct recordContainer;
    while (recordReader.nextKeyValue()) {
        recordContainer = (OrcStruct) OrcUtils.createValueRecursively(orcReader.getSchema());
        OrcUtils.upConvertOrcStruct((OrcStruct) recordReader.getCurrentValue(), recordContainer, orcReader.getSchema());
        result.add(recordContainer);
    }
    return result;
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) Configuration(org.apache.hadoop.conf.Configuration) OrcFile(org.apache.orc.OrcFile) ArrayList(java.util.ArrayList) Reader(org.apache.orc.Reader) OrcMapreduceRecordReader(org.apache.orc.mapreduce.OrcMapreduceRecordReader) ReaderImpl(org.apache.orc.impl.ReaderImpl)
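
For context, a minimal usage sketch of readOrcFile is shown below. The test method name, the output path, the field name "i", and the assertion are hypothetical illustrations, not part of the original test.

@Test
public void verifyCompactedOutput() throws Exception {
    // Hypothetical output location; the real test derives this from the compaction job's output directory.
    Path compactedFilePath = new Path("/tmp/compaction-output/part-m-00000.orc");
    List<OrcStruct> records = readOrcFile(compactedFilePath);
    for (OrcStruct record : records) {
        // Per the Javadoc above, every field is expected to hold an int value.
        IntWritable value = (IntWritable) record.getFieldValue("i");
        Assert.assertNotNull(value);
    }
}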

Example 32 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

The class OrcKeyComparatorTest, method testSimpleComparator.

@Test
public void testSimpleComparator() throws Exception {
    OrcKeyComparator comparator = new OrcKeyComparator();
    Configuration conf = new Configuration();
    String orcSchema = "struct<i:int,j:int>";
    TypeDescription schema = TypeDescription.fromString(orcSchema);
    conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), orcSchema);
    Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), orcSchema);
    comparator.setConf(conf);
    OrcStruct record0 = createSimpleOrcStruct(schema, 1, 2);
    OrcStruct record1 = createSimpleOrcStruct(schema, 3, 4);
    OrcStruct record2 = createSimpleOrcStruct(schema, 3, 4);
    OrcKey orcKey0 = new OrcKey();
    orcKey0.key = record0;
    OrcKey orcKey1 = new OrcKey();
    orcKey1.key = record1;
    OrcKey orcKey2 = new OrcKey();
    orcKey2.key = record2;
    Assert.assertTrue(comparator.compare(orcKey0, orcKey1) < 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey2) == 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey0) > 0);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) Configuration(org.apache.hadoop.conf.Configuration) TypeDescription(org.apache.orc.TypeDescription) OrcKey(org.apache.orc.mapred.OrcKey) Test(org.testng.annotations.Test)
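
The createSimpleOrcStruct helper referenced above is not included in this snippet. A plausible reconstruction, inferred only from the call sites and the struct<i:int,j:int> schema in the test, might look like the following; the real implementation in the project may differ.

private OrcStruct createSimpleOrcStruct(TypeDescription structSchema, int value1, int value2) {
    OrcStruct result = new OrcStruct(structSchema);
    // Set the two int fields by position so the helper also works for struct<x:int,y:int>.
    result.setFieldValue(0, new IntWritable(value1));
    result.setFieldValue(1, new IntWritable(value2));
    return result;
}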

Example 33 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

The class OrcKeyComparatorTest, method testComplexRecordUnion.

// Test comparison for union containing complex types and nested record inside.
// Schema: struct<a:int,
// b:uniontype<int,
// array<string>,
// struct<x:int,y:int>
// >
// >
@Test
public void testComplexRecordUnion() throws Exception {
    OrcKeyComparator comparator = new OrcKeyComparator();
    Configuration conf = new Configuration();
    TypeDescription listSchema = TypeDescription.createList(TypeDescription.createString());
    TypeDescription nestedRecordSchema = TypeDescription.createStruct().addField("x", TypeDescription.createInt()).addField("y", TypeDescription.createInt());
    TypeDescription unionSchema = TypeDescription.createUnion().addUnionChild(TypeDescription.createInt()).addUnionChild(listSchema).addUnionChild(nestedRecordSchema);
    TypeDescription schema = TypeDescription.createStruct().addField("a", TypeDescription.createInt()).addField("b", unionSchema);
    conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), schema.toString());
    Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), schema.toString());
    comparator.setConf(conf);
    // base record
    OrcStruct record0 = (OrcStruct) OrcStruct.createValue(schema);
    record0.setFieldValue("a", new IntWritable(1));
    OrcStruct nestedRecord0 = createSimpleOrcStruct(nestedRecordSchema, 1, 2);
    OrcUnion orcUnion0 = createOrcUnion(unionSchema, nestedRecord0);
    record0.setFieldValue("b", orcUnion0);
    // Same content as the base record, but in different objects.
    OrcStruct record1 = (OrcStruct) OrcStruct.createValue(schema);
    record1.setFieldValue("a", new IntWritable(1));
    OrcStruct nestedRecord1 = createSimpleOrcStruct(nestedRecordSchema, 1, 2);
    OrcUnion orcUnion1 = createOrcUnion(unionSchema, nestedRecord1);
    record1.setFieldValue("b", orcUnion1);
    // Different nested records inside the union: record0 == record1 < record2
    OrcStruct record2 = (OrcStruct) OrcStruct.createValue(schema);
    record2.setFieldValue("a", new IntWritable(1));
    OrcStruct nestedRecord2 = createSimpleOrcStruct(nestedRecordSchema, 2, 2);
    OrcUnion orcUnion2 = createOrcUnion(unionSchema, nestedRecord2);
    record2.setFieldValue("b", orcUnion2);
    // Differ in the list inside the union: record3 < record4 == record5
    OrcStruct record3 = (OrcStruct) OrcStruct.createValue(schema);
    record3.setFieldValue("a", new IntWritable(1));
    OrcList orcList3 = createOrcList(5, listSchema, 2);
    OrcUnion orcUnion3 = createOrcUnion(unionSchema, orcList3);
    record3.setFieldValue("b", orcUnion3);
    OrcStruct record4 = (OrcStruct) OrcStruct.createValue(schema);
    record4.setFieldValue("a", new IntWritable(1));
    OrcList orcList4 = createOrcList(6, listSchema, 2);
    OrcUnion orcUnion4 = createOrcUnion(unionSchema, orcList4);
    record4.setFieldValue("b", orcUnion4);
    OrcStruct record5 = (OrcStruct) OrcStruct.createValue(schema);
    record5.setFieldValue("a", new IntWritable(1));
    OrcList orcList5 = createOrcList(6, listSchema, 2);
    OrcUnion orcUnion5 = createOrcUnion(unionSchema, orcList5);
    record5.setFieldValue("b", orcUnion5);
    OrcKey orcKey0 = new OrcKey();
    orcKey0.key = record0;
    OrcKey orcKey1 = new OrcKey();
    orcKey1.key = record1;
    OrcKey orcKey2 = new OrcKey();
    orcKey2.key = record2;
    OrcKey orcKey3 = new OrcKey();
    orcKey3.key = record3;
    OrcKey orcKey4 = new OrcKey();
    orcKey4.key = record4;
    OrcKey orcKey5 = new OrcKey();
    orcKey5.key = record5;
    Assert.assertEquals(orcUnion0, orcUnion1);
    // The int value inside orcKey2's nested record is larger
    Assert.assertTrue(comparator.compare(orcKey0, orcKey2) < 0);
    Assert.assertTrue(comparator.compare(orcKey3, orcKey4) < 0);
    Assert.assertTrue(comparator.compare(orcKey3, orcKey5) < 0);
    Assert.assertTrue(comparator.compare(orcKey4, orcKey5) == 0);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) Configuration(org.apache.hadoop.conf.Configuration) OrcList(org.apache.orc.mapred.OrcList) TypeDescription(org.apache.orc.TypeDescription) OrcUnion(org.apache.orc.mapred.OrcUnion) OrcKey(org.apache.orc.mapred.OrcKey) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.testng.annotations.Test)
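
The createOrcList and createOrcUnion helpers are likewise not shown. The sketches below are hypothetical reconstructions inferred from the call sites (createOrcList(5, listSchema, 2) is read as value, schema, length); the union helper's tag selection is an assumption, since only a value and a schema are passed in.

private OrcList<Text> createOrcList(int value, TypeDescription listSchema, int size) {
    // The schema is array<string>, so the list is filled with `size` copies of the stringified value.
    OrcList<Text> result = new OrcList<>(listSchema);
    for (int i = 0; i < size; i++) {
        result.add(new Text(String.valueOf(value)));
    }
    return result;
}

private OrcUnion createOrcUnion(TypeDescription unionSchema, WritableComparable value) {
    OrcUnion result = new OrcUnion(unionSchema);
    // Assumption: choose the union tag whose child type matches the runtime type of the value.
    List<TypeDescription> children = unionSchema.getChildren();
    for (byte tag = 0; tag < children.size(); tag++) {
        TypeDescription.Category category = children.get(tag).getCategory();
        if ((value instanceof OrcStruct && category == TypeDescription.Category.STRUCT)
                || (value instanceof OrcList && category == TypeDescription.Category.LIST)
                || (value instanceof IntWritable && category == TypeDescription.Category.INT)) {
            result.set(tag, value);
            return result;
        }
    }
    throw new IllegalArgumentException("No union child matches value type " + value.getClass());
}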

Example 34 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

The class OrcKeyComparatorTest, method testComplexRecordMap.

@Test
public void testComplexRecordMap() throws Exception {
    OrcKeyComparator comparator = new OrcKeyComparator();
    Configuration conf = new Configuration();
    TypeDescription mapFieldSchema = TypeDescription.createMap(TypeDescription.createString(), TypeDescription.createString());
    TypeDescription schema = TypeDescription.createStruct().addField("a", TypeDescription.createInt()).addField("b", mapFieldSchema);
    conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), schema.toString());
    Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), schema.toString());
    comparator.setConf(conf);
    // base record
    OrcStruct record0 = (OrcStruct) OrcStruct.createValue(schema);
    record0.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap = createSimpleOrcMap(new Text("key"), new Text("value"), mapFieldSchema);
    record0.setFieldValue("b", orcMap);
    // Key and value both differ
    OrcStruct record1 = (OrcStruct) OrcStruct.createValue(schema);
    record1.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap1 = createSimpleOrcMap(new Text("key_key"), new Text("value_value"), mapFieldSchema);
    record1.setFieldValue("b", orcMap1);
    // Key same, value differs
    OrcStruct record2 = (OrcStruct) OrcStruct.createValue(schema);
    record2.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap2 = createSimpleOrcMap(new Text("key"), new Text("value_value"), mapFieldSchema);
    record2.setFieldValue("b", orcMap2);
    // Same as base
    OrcStruct record3 = (OrcStruct) OrcStruct.createValue(schema);
    record3.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap3 = createSimpleOrcMap(new Text("key"), new Text("value"), mapFieldSchema);
    record3.setFieldValue("b", orcMap3);
    // Differs in the other field.
    OrcStruct record4 = (OrcStruct) OrcStruct.createValue(schema);
    record4.setFieldValue("a", new IntWritable(2));
    record4.setFieldValue("b", orcMap);
    // Records whose maps contain the same entries, inserted in a different order.
    OrcStruct record6 = (OrcStruct) OrcStruct.createValue(schema);
    record6.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap6 = createSimpleOrcMap(new Text("key"), new Text("value"), mapFieldSchema);
    orcMap6.put(new Text("keyLater"), new Text("valueLater"));
    record6.setFieldValue("b", orcMap6);
    OrcStruct record7 = (OrcStruct) OrcStruct.createValue(schema);
    record7.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap7 = createSimpleOrcMap(new Text("keyLater"), new Text("valueLater"), mapFieldSchema);
    orcMap7.put(new Text("key"), new Text("value"));
    record7.setFieldValue("b", orcMap7);
    OrcKey orcKey0 = new OrcKey();
    orcKey0.key = record0;
    OrcKey orcKey1 = new OrcKey();
    orcKey1.key = record1;
    OrcKey orcKey2 = new OrcKey();
    orcKey2.key = record2;
    OrcKey orcKey3 = new OrcKey();
    orcKey3.key = record3;
    OrcKey orcKey4 = new OrcKey();
    orcKey4.key = record4;
    OrcKey orcKey6 = new OrcKey();
    orcKey6.key = record6;
    OrcKey orcKey7 = new OrcKey();
    orcKey7.key = record7;
    Assert.assertTrue(comparator.compare(orcKey0, orcKey1) < 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey2) > 0);
    Assert.assertTrue(comparator.compare(orcKey2, orcKey3) > 0);
    Assert.assertTrue(comparator.compare(orcKey0, orcKey3) == 0);
    Assert.assertTrue(comparator.compare(orcKey0, orcKey4) < 0);
    Assert.assertTrue(comparator.compare(orcKey6, orcKey7) == 0);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) Configuration(org.apache.hadoop.conf.Configuration) TypeDescription(org.apache.orc.TypeDescription) Text(org.apache.hadoop.io.Text) OrcKey(org.apache.orc.mapred.OrcKey) IntWritable(org.apache.hadoop.io.IntWritable) OrcMap(org.apache.orc.mapred.OrcMap) Test(org.testng.annotations.Test)
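
The createSimpleOrcMap helper is not included here either; below is a minimal hypothetical sketch consistent with the map<string,string> schema used in the test.

private OrcMap<Text, Text> createSimpleOrcMap(Text key, Text value, TypeDescription mapSchema) {
    // OrcMap extends TreeMap, so entries are kept in key order regardless of insertion order,
    // which is why record6 and record7 above compare as equal.
    OrcMap<Text, Text> result = new OrcMap<>(mapSchema);
    result.put(key, value);
    return result;
}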

Example 35 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

The class OrcUtils, method structConversionHelper.

/**
 * This method recursively copies the value of object {@param w} into object {@param v}, even if the schemas of w and v
 * differ in a compatible way: if a field exists in v but not in w, it is filled with a null value.
 * It serves as a helper method for {@link #upConvertOrcStruct(OrcStruct, OrcStruct, TypeDescription)} when an OrcStruct
 * contains a nested structure as a member.
 *
 * Suppress the unchecked-cast warning: all casts are valid since they operate on (sub)elements of ORC types.
 * A failed cast will throw a ClassCastException and abort the process.
 */
@SuppressWarnings("unchecked")
private static WritableComparable structConversionHelper(WritableComparable w, WritableComparable v, TypeDescription targetSchema) {
    if (w instanceof OrcStruct) {
        upConvertOrcStruct((OrcStruct) w, (OrcStruct) v, targetSchema);
    } else if (w instanceof OrcList) {
        OrcList castedList = (OrcList) w;
        OrcList targetList = (OrcList) v;
        TypeDescription elementType = targetSchema.getChildren().get(0);
        targetList.clear();
        for (int i = 0; i < castedList.size(); i++) {
            WritableComparable targetListRecordContainer = createValueRecursively(elementType, 0);
            targetList.add(i, structConversionHelper((WritableComparable) castedList.get(i), targetListRecordContainer, elementType));
        }
    } else if (w instanceof OrcMap) {
        OrcMap castedMap = (OrcMap) w;
        OrcMap targetMap = (OrcMap) v;
        TypeDescription valueSchema = targetSchema.getChildren().get(1);
        targetMap.clear();
        for (Object entry : castedMap.entrySet()) {
            Map.Entry<WritableComparable, WritableComparable> castedEntry = (Map.Entry<WritableComparable, WritableComparable>) entry;
            WritableComparable targetMapRecordContainer = createValueRecursively(valueSchema);
            targetMapRecordContainer = structConversionHelper(castedEntry.getValue(), targetMapRecordContainer, valueSchema);
            targetMap.put(castedEntry.getKey(), targetMapRecordContainer);
        }
    } else if (w instanceof OrcUnion) {
        OrcUnion castedUnion = (OrcUnion) w;
        OrcUnion targetUnion = (OrcUnion) v;
        byte tag = castedUnion.getTag();
        // ORC doesn't support Union type widening
        // Avro doesn't allow it either, reference: https://avro.apache.org/docs/current/spec.html#Schema+Resolution
        // As a result, the member schemas within source and target should be identical.
        TypeDescription targetMemberSchema = targetSchema.getChildren().get(tag);
        targetUnion.set(tag, structConversionHelper((WritableComparable) castedUnion.getObject(), (WritableComparable) OrcUtils.createValueRecursively(targetMemberSchema), targetMemberSchema));
    } else {
        // Regardless of whether type-widening is happening, this copies the value of w into v.
        handlePrimitiveWritableComparable(w, v);
    }
    // Whether v is a composite type or type-widening was required, v has already been populated with w's value recursively.
    return v;
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) OrcList(org.apache.orc.mapred.OrcList) WritableComparable(org.apache.hadoop.io.WritableComparable) TypeDescription(org.apache.orc.TypeDescription) OrcUnion(org.apache.orc.mapred.OrcUnion) HashMap(java.util.HashMap) Map(java.util.Map) OrcMap(org.apache.orc.mapred.OrcMap)
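
A minimal sketch of the up-conversion this helper supports, using the OrcUtils entry points referenced in the readOrcFile example above; the schemas and field values here are illustrative.

TypeDescription sourceSchema = TypeDescription.fromString("struct<i:int>");
TypeDescription targetSchema = TypeDescription.fromString("struct<i:int,j:int>");

OrcStruct source = (OrcStruct) OrcStruct.createValue(sourceSchema);
source.setFieldValue("i", new IntWritable(42));

// Build a container for the wider schema and copy the source into it.
OrcStruct target = (OrcStruct) OrcUtils.createValueRecursively(targetSchema);
OrcUtils.upConvertOrcStruct(source, target, targetSchema);

// "i" carries over; "j" has no counterpart in the source, so it is filled with null
// as described in the Javadoc above.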

Aggregations

OrcStruct (org.apache.orc.mapred.OrcStruct): 39
TypeDescription (org.apache.orc.TypeDescription): 24
Configuration (org.apache.hadoop.conf.Configuration): 18
IntWritable (org.apache.hadoop.io.IntWritable): 17
Test (org.testng.annotations.Test): 15
ArrayList (java.util.ArrayList): 13
Test (org.junit.Test): 9
OrcFile (org.apache.orc.OrcFile): 8
OrcList (org.apache.orc.mapred.OrcList): 8
File (java.io.File): 7
InputRow (org.apache.druid.data.input.InputRow): 7
Path (org.apache.hadoop.fs.Path): 7
Job (org.apache.hadoop.mapreduce.Job): 7
OrcUnion (org.apache.orc.mapred.OrcUnion): 7
ImmutableList (com.google.common.collect.ImmutableList): 6
List (java.util.List): 6
HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig): 6
Text (org.apache.hadoop.io.Text): 6
OrcKey (org.apache.orc.mapred.OrcKey): 5
OrcMap (org.apache.orc.mapred.OrcMap): 5