Search in sources :

Example 11 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcCompactionTaskTest method writeOrcRecordsInFile.

/**
 * Writes the given ORC structs to a new ORC file at {@code path} using {@code schema}.
 *
 * <p>The record writer is closed in a {@code finally} block so the underlying ORC writer is
 * released even when writing one of the records fails.
 *
 * @param path       location of the ORC file to create
 * @param schema     ORC schema handed to the writer options
 * @param orcStructs records to write, in iteration order
 * @throws Exception if creating the writer, writing a record, or closing the writer fails
 */
private void writeOrcRecordsInFile(Path path, TypeDescription schema, List<OrcStruct> orcStructs) throws Exception {
    Configuration configuration = new Configuration();
    OrcFile.WriterOptions options = OrcFile.writerOptions(configuration).setSchema(schema);
    Writer writer = OrcFile.createWriter(path, options);
    // Parameterized instead of raw so write() is type-checked against OrcStruct.
    OrcMapreduceRecordWriter<OrcStruct> recordWriter = new OrcMapreduceRecordWriter<>(writer);
    try {
        for (OrcStruct orcRecord : orcStructs) {
            recordWriter.write(NullWritable.get(), orcRecord);
        }
    } finally {
        // Always close: otherwise a failed write leaves the ORC writer (and its file handle) open.
        recordWriter.close(new TaskAttemptContextImpl(configuration, new TaskAttemptID()));
    }
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) Configuration(org.apache.hadoop.conf.Configuration) OrcMapreduceRecordWriter(org.apache.orc.mapreduce.OrcMapreduceRecordWriter) OrcFile(org.apache.orc.OrcFile) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) Writer(org.apache.orc.Writer)

Example 12 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcCompactionTaskTest method basicTestWithShuffleKeySpecified.

/**
 * Runs an embedded compaction job with an explicit ORC mapper shuffle-key schema and verifies the
 * compacted output.
 *
 * <p>The test creates the minutely input directory under a temporary base path, populates it via
 * {@code createTestingData}, runs the job with {@code ORC_MAPPER_SHUFFLE_KEY_SCHEMA} set to
 * {@code "struct<k:int>"}, and expects exactly one file under the hourly directory containing the
 * three records (1,2), (2,3) and (4,5) in that order.
 *
 * @throws Exception if setting up the input, running the job, or reading the output fails
 */
@Test
public void basicTestWithShuffleKeySpecified() throws Exception {
    File basePath = Files.createTempDir();
    // NOTE(review): File.deleteOnExit only removes empty directories, so the populated temp tree
    // is probably left behind after the JVM exits — consider a recursive cleanup.
    basePath.deleteOnExit();
    String minutelyPath = "Identity/MemberAccount/minutely/2017/04/03/10/20_30/run_2017-04-03-10-20";
    String hourlyPath = "Identity/MemberAccount/hourly/2017/04/03/10/";
    File jobDir = new File(basePath, minutelyPath);
    Assert.assertTrue(jobDir.mkdirs());
    // Writing some basic ORC files
    // Testing data is schema'ed with "struct<i:int,j:int>"
    createTestingData(jobDir);
    EmbeddedGobblin embeddedGobblin = TestCompactionTaskUtils.createEmbeddedGobblinCompactionJob("basic", basePath.getAbsolutePath())
        .setConfiguration(CompactionJobConfigurator.COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS_KEY, TestCompactionOrcJobConfigurator.Factory.class.getName())
        .setConfiguration(COMPACTION_OUTPUT_EXTENSION, extensionName)
        .setConfiguration(ORC_MAPPER_SHUFFLE_KEY_SCHEMA, "struct<k:int>");
    JobExecutionResult execution = embeddedGobblin.run();
    Assert.assertTrue(execution.isSuccessful());
    // Result verification
    File outputDir = new File(basePath, hourlyPath);
    FileSystem fs = FileSystem.getLocal(new Configuration());
    List<FileStatus> statuses = new ArrayList<>();
    reloadFolder(statuses, outputDir, fs);
    // assertEquals (TestNG order: actual, expected) reports the actual count on failure,
    // unlike assertTrue(size == 1).
    Assert.assertEquals(statuses.size(), 1);
    List<OrcStruct> result = readOrcFile(statuses.get(0).getPath());
    Assert.assertEquals(result.size(), 3);
    Assert.assertEquals(result.get(0).getFieldValue("i"), new IntWritable(1));
    Assert.assertEquals(result.get(0).getFieldValue("j"), new IntWritable(2));
    Assert.assertEquals(result.get(1).getFieldValue("i"), new IntWritable(2));
    Assert.assertEquals(result.get(1).getFieldValue("j"), new IntWritable(3));
    Assert.assertEquals(result.get(2).getFieldValue("i"), new IntWritable(4));
    Assert.assertEquals(result.get(2).getFieldValue("j"), new IntWritable(5));
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) JobExecutionResult(org.apache.gobblin.runtime.api.JobExecutionResult) OrcStruct(org.apache.orc.mapred.OrcStruct) FileSystem(org.apache.hadoop.fs.FileSystem) EmbeddedGobblin(org.apache.gobblin.runtime.embedded.EmbeddedGobblin) OrcFile(org.apache.orc.OrcFile) File(java.io.File) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.testng.annotations.Test)

Example 13 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcCompactionTaskTest method testNonDedup.

/**
 * Runs an embedded compaction job with {@code COMPACTION_SHOULD_DEDUPLICATE} set to
 * {@code "false"} and verifies that duplicate records survive compaction.
 *
 * <p>The test populates the minutely input directory via {@code createTestingData}, runs the job,
 * and expects exactly one {@code .orc} file under the hourly directory. After sorting the records
 * with {@code OrcStruct.compareTo}, it expects (1,2), (1,2), (2,3) and (4,5).
 *
 * @throws Exception if setting up the input, running the job, or reading the output fails
 */
@Test
public void testNonDedup() throws Exception {
    File basePath = Files.createTempDir();
    // NOTE(review): File.deleteOnExit only removes empty directories, so the populated temp tree
    // is probably left behind after the JVM exits — consider a recursive cleanup.
    basePath.deleteOnExit();
    String minutelyPath = "Identity/MemberAccount_2/minutely/2017/04/03/10/20_30/run_2017-04-03-10-20";
    String hourlyPath = "Identity/MemberAccount_2/hourly/2017/04/03/10/";
    File jobDir = new File(basePath, minutelyPath);
    Assert.assertTrue(jobDir.mkdirs());
    createTestingData(jobDir);
    // getAbsolutePath() already returns a String; the former trailing toString() was a no-op.
    EmbeddedGobblin embeddedGobblin_nondedup = createEmbeddedGobblinCompactionJob("basic", basePath.getAbsolutePath())
        .setConfiguration(CompactionJobConfigurator.COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS_KEY, TestCompactionOrcJobConfigurator.Factory.class.getName())
        .setConfiguration(COMPACTION_OUTPUT_EXTENSION, "orc")
        .setConfiguration(COMPACTION_SHOULD_DEDUPLICATE, "false");
    JobExecutionResult execution = embeddedGobblin_nondedup.run();
    Assert.assertTrue(execution.isSuccessful());
    // Non-dedup result verification
    File outputDir = new File(basePath, hourlyPath);
    FileSystem fs = FileSystem.getLocal(new Configuration());
    List<FileStatus> statuses = new ArrayList<>();
    // Only files with the "orc" extension are collected from the output directory.
    for (FileStatus status : fs.listStatus(new Path(outputDir.getAbsolutePath()),
        path -> FilenameUtils.isExtension(path.getName(), "orc"))) {
        statuses.add(status);
    }
    // assertEquals (TestNG order: actual, expected) reports the actual count on failure.
    Assert.assertEquals(statuses.size(), 1);
    List<OrcStruct> result = readOrcFile(statuses.get(0).getPath());
    Assert.assertEquals(result.size(), 4);
    // Sort first so the positional assertions below do not depend on the order in the file.
    result.sort(OrcStruct::compareTo);
    Assert.assertEquals(result.get(0).getFieldValue("i"), new IntWritable(1));
    Assert.assertEquals(result.get(0).getFieldValue("j"), new IntWritable(2));
    Assert.assertEquals(result.get(1).getFieldValue("i"), new IntWritable(1));
    Assert.assertEquals(result.get(1).getFieldValue("j"), new IntWritable(2));
    Assert.assertEquals(result.get(2).getFieldValue("i"), new IntWritable(2));
    Assert.assertEquals(result.get(2).getFieldValue("j"), new IntWritable(3));
    Assert.assertEquals(result.get(3).getFieldValue("i"), new IntWritable(4));
    Assert.assertEquals(result.get(3).getFieldValue("j"), new IntWritable(5));
}
Also used : Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) JobExecutionResult(org.apache.gobblin.runtime.api.JobExecutionResult) OrcStruct(org.apache.orc.mapred.OrcStruct) FileSystem(org.apache.hadoop.fs.FileSystem) EmbeddedGobblin(org.apache.gobblin.runtime.embedded.EmbeddedGobblin) OrcFile(org.apache.orc.OrcFile) File(java.io.File) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.testng.annotations.Test)

Example 14 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcKeyComparatorTest method testComplexRecordArray.

/**
 * Exercises {@code OrcKeyComparator} on struct keys made of an int field {@code a} and a
 * list-of-string field {@code b}.
 *
 * <p>A base record is compared against an identically built copy (expected to compare equal) and
 * against three variants that differ in the int field or in the arguments passed to
 * {@code createOrcList} (the base is expected to sort before each of them).
 *
 * @throws Exception declared by the test signature
 */
@Test
public void testComplexRecordArray() throws Exception {
    OrcKeyComparator keyComparator = new OrcKeyComparator();
    Configuration hadoopConf = new Configuration();

    TypeDescription stringListSchema = TypeDescription.createList(TypeDescription.createString());
    TypeDescription recordSchema = TypeDescription.createStruct()
        .addField("a", TypeDescription.createInt())
        .addField("b", stringListSchema);

    // The comparator picks up the shuffle-key schema from the configuration.
    String shuffleKeySchemaProperty = OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute();
    hadoopConf.set(shuffleKeySchemaProperty, recordSchema.toString());
    Assert.assertEquals(hadoopConf.get(shuffleKeySchemaProperty), recordSchema.toString());
    keyComparator.setConf(hadoopConf);

    // Base record.
    OrcStruct baseRecord = (OrcStruct) OrcStruct.createValue(recordSchema);
    baseRecord.setFieldValue("a", new IntWritable(1));
    OrcList baseList = createOrcList(3, stringListSchema, 3);
    baseRecord.setFieldValue("b", baseList);
    OrcKey baseKey = new OrcKey();
    baseKey.key = baseRecord;

    // Same content as the base but a distinct object; expected to compare equal.
    OrcStruct sameAsBase = (OrcStruct) OrcStruct.createValue(recordSchema);
    sameAsBase.setFieldValue("a", new IntWritable(1));
    OrcList sameAsBaseList = createOrcList(3, stringListSchema, 3);
    sameAsBase.setFieldValue("b", sameAsBaseList);
    OrcKey sameAsBaseKey = new OrcKey();
    sameAsBaseKey.key = sameAsBase;

    // Differs in the int field.
    OrcStruct largerInt = (OrcStruct) OrcStruct.createValue(recordSchema);
    largerInt.setFieldValue("a", new IntWritable(2));
    OrcList largerIntList = createOrcList(3, stringListSchema, 3);
    largerInt.setFieldValue("b", largerIntList);
    OrcKey largerIntKey = new OrcKey();
    largerIntKey.key = largerInt;

    // Differs in the array field: third createOrcList argument changed.
    OrcStruct arrayVariantOne = (OrcStruct) OrcStruct.createValue(recordSchema);
    arrayVariantOne.setFieldValue("a", new IntWritable(1));
    OrcList arrayVariantOneList = createOrcList(3, stringListSchema, 5);
    arrayVariantOne.setFieldValue("b", arrayVariantOneList);
    OrcKey arrayVariantOneKey = new OrcKey();
    arrayVariantOneKey.key = arrayVariantOne;

    // Differs in the array field: first createOrcList argument changed.
    OrcStruct arrayVariantTwo = (OrcStruct) OrcStruct.createValue(recordSchema);
    arrayVariantTwo.setFieldValue("a", new IntWritable(1));
    OrcList arrayVariantTwoList = createOrcList(4, stringListSchema, 3);
    arrayVariantTwo.setFieldValue("b", arrayVariantTwoList);
    OrcKey arrayVariantTwoKey = new OrcKey();
    arrayVariantTwoKey.key = arrayVariantTwo;

    int equalContent = keyComparator.compare(baseKey, sameAsBaseKey);
    Assert.assertTrue(equalContent == 0);
    int versusLargerInt = keyComparator.compare(sameAsBaseKey, largerIntKey);
    Assert.assertTrue(versusLargerInt < 0);
    int versusArrayVariantOne = keyComparator.compare(sameAsBaseKey, arrayVariantOneKey);
    Assert.assertTrue(versusArrayVariantOne < 0);
    int versusArrayVariantTwo = keyComparator.compare(sameAsBaseKey, arrayVariantTwoKey);
    Assert.assertTrue(versusArrayVariantTwo < 0);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) Configuration(org.apache.hadoop.conf.Configuration) OrcList(org.apache.orc.mapred.OrcList) TypeDescription(org.apache.orc.TypeDescription) OrcKey(org.apache.orc.mapred.OrcKey) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.testng.annotations.Test)

Example 15 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcKeyComparatorTest method createSimpleOrcStruct.

/**
 * Builds an {@code OrcStruct} for {@code structSchema} whose fields at positions 0 and 1 hold
 * {@code value1} and {@code value2}, each wrapped in an {@code IntWritable}.
 *
 * @param structSchema schema of the struct to create; assumed to declare at least two int
 *                     fields (not checked here)
 * @param value1       value stored at field position 0
 * @param value2       value stored at field position 1
 * @return the populated struct
 */
private OrcStruct createSimpleOrcStruct(TypeDescription structSchema, int value1, int value2) {
    final OrcStruct struct = new OrcStruct(structSchema);
    final int[] positionalValues = {value1, value2};
    for (int fieldIndex = 0; fieldIndex < positionalValues.length; fieldIndex++) {
        struct.setFieldValue(fieldIndex, new IntWritable(positionalValues[fieldIndex]));
    }
    return struct;
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) IntWritable(org.apache.hadoop.io.IntWritable)

Aggregations

OrcStruct (org.apache.orc.mapred.OrcStruct) 39, TypeDescription (org.apache.orc.TypeDescription) 24, Configuration (org.apache.hadoop.conf.Configuration) 18, IntWritable (org.apache.hadoop.io.IntWritable) 17, Test (org.testng.annotations.Test) 15, ArrayList (java.util.ArrayList) 13, Test (org.junit.Test) 9, OrcFile (org.apache.orc.OrcFile) 8, OrcList (org.apache.orc.mapred.OrcList) 8, File (java.io.File) 7, InputRow (org.apache.druid.data.input.InputRow) 7, Path (org.apache.hadoop.fs.Path) 7, Job (org.apache.hadoop.mapreduce.Job) 7, OrcUnion (org.apache.orc.mapred.OrcUnion) 7, ImmutableList (com.google.common.collect.ImmutableList) 6, List (java.util.List) 6, HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig) 6, Text (org.apache.hadoop.io.Text) 6, OrcKey (org.apache.orc.mapred.OrcKey) 5, OrcMap (org.apache.orc.mapred.OrcMap) 5