Search in sources :

Example 6 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project druid by druid-io.

In the class OrcStructConverterTest, the method testConvertRootFieldWithStructOfNullsReturningStructOfNulls:

@Test
public void testConvertRootFieldWithStructOfNullsReturningStructOfNulls() {
    // A struct whose fields are all null should convert to a map with a null
    // entry per field, rather than being dropped or collapsed to a null map.
    final TypeDescription schema = TypeDescription.createStruct();
    schema.addField("int", TypeDescription.createInt());
    schema.addField("float", TypeDescription.createFloat());
    final OrcStruct struct = new OrcStruct(schema);
    final Map<String, Object> expected = new HashMap<>();
    // Null out every field and mirror the same nulls in the expected map.
    for (String field : new String[] {"int", "float"}) {
        struct.setFieldValue(field, null);
        expected.put(field, null);
    }
    assertConversion(new OrcStructConverter(false), schema, expected, struct);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) HashMap(java.util.HashMap) TypeDescription(org.apache.orc.TypeDescription) Test(org.junit.Test)

Example 7 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project druid by druid-io.

In the class OrcStructConverterTest, the helper method assertConversion:

/**
 * Wraps {@code actualValueInOrc} in a single-field root struct named "field",
 * runs it through the converter, and checks the converted value. A null
 * expectation asserts that conversion yields null for the field.
 */
private static void assertConversion(OrcStructConverter converter, TypeDescription fieldType, @Nullable Object expectedValueAfterConversion, @Nullable WritableComparable actualValueInOrc) {
    final String fieldName = "field";
    final OrcStruct root = new OrcStruct(createRootSchema(fieldName, fieldType));
    root.setFieldValue(fieldName, actualValueInOrc);
    // Guard clause: the null expectation is the special case.
    if (expectedValueAfterConversion == null) {
        assertNullValue(converter, root, fieldName);
        return;
    }
    assertFieldValue(expectedValueAfterConversion, converter, root, fieldName);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) TypeDescription(org.apache.orc.TypeDescription)

Example 8 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project druid by druid-io.

In the class OrcStructConverterTest, the method testConvertRootFieldWithStructOfNonNullPrimitivesReturningValuesAsTheyAre:

@Test
public void testConvertRootFieldWithStructOfNonNullPrimitivesReturningValuesAsTheyAre() {
    // A struct of non-null primitives should convert to a map holding the
    // unwrapped primitive values.
    final TypeDescription structType = TypeDescription.createStruct();
    structType.addField("int", TypeDescription.createInt());
    structType.addField("float", TypeDescription.createFloat());
    final OrcStruct orcStruct = new OrcStruct(structType);
    orcStruct.setFieldValue("int", new IntWritable(10));
    orcStruct.setFieldValue("float", new FloatWritable(10.f));
    // State the expected values literally instead of reading them back from
    // orcStruct: deriving the expectation from the object under test would let
    // the assertion pass trivially even if set/getFieldValue were broken.
    final Map<String, Object> expectedResult = new HashMap<>();
    expectedResult.put("int", 10);
    expectedResult.put("float", 10.f);
    final OrcStructConverter converter = new OrcStructConverter(false);
    assertConversion(converter, structType, expectedResult, orcStruct);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) FloatWritable(org.apache.hadoop.io.FloatWritable) HashMap(java.util.HashMap) TypeDescription(org.apache.orc.TypeDescription) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)

Example 9 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

In the class OrcCompactionTaskTest, the method testReducerSideDedup:

@Test
public void testReducerSideDedup() throws Exception {
    // Verifies that reducer-side dedup keys only on the configured shuffle-key
    // schema (a.a, a.c): records that differ outside the key survive, records
    // identical under the key are collapsed.
    File basePath = Files.createTempDir();
    basePath.deleteOnExit();
    String minutelyPath = "Identity/MemberAccount/minutely/2020/04/03/10/20_30/run_2020-04-03-10-20";
    String hourlyPath = "Identity/MemberAccount/hourly/2020/04/03/10/";
    File jobDir = new File(basePath, minutelyPath);
    Assert.assertTrue(jobDir.mkdirs());
    TypeDescription nestedSchema = TypeDescription.fromString("struct<a:struct<a:int,b:string,c:int>,b:string,c:uniontype<int,string>>");
    // Create three records with same value except "b" column in the top-level.
    // (The casts on setFieldValue were redundant: the variables are already OrcStruct.)
    OrcStruct nested_struct_1 = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(nested_struct_1, nestedSchema, 1, "test1", true);
    nested_struct_1.setFieldValue("b", new Text("uno"));
    OrcStruct nested_struct_2 = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(nested_struct_2, nestedSchema, 1, "test2", true);
    nested_struct_2.setFieldValue("b", new Text("dos"));
    OrcStruct nested_struct_3 = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(nested_struct_3, nestedSchema, 1, "test3", true);
    nested_struct_3.setFieldValue("b", new Text("tres"));
    // Create another two records with different value from the above three, and these two differs in column b as well.
    OrcStruct nested_struct_4 = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(nested_struct_4, nestedSchema, 2, "test2", false);
    nested_struct_4.setFieldValue("b", new Text("uno"));
    // This record will be considered as a duplicate of nested_struct_4.
    OrcStruct nested_struct_5 = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
    OrcTestUtils.fillOrcStructWithFixedValue(nested_struct_5, nestedSchema, 2, "test2", false);
    nested_struct_5.setFieldValue("b", new Text("uno"));
    // Following pattern: FILENAME.RECORDCOUNT.EXTENSION
    File file_0 = new File(jobDir, "file_0.5." + extensionName);
    writeOrcRecordsInFile(new Path(file_0.getAbsolutePath()), nestedSchema, ImmutableList.of(nested_struct_1, nested_struct_2, nested_struct_3, nested_struct_4, nested_struct_5));
    // getAbsolutePath() already returns a String; the extra toString() was redundant.
    EmbeddedGobblin embeddedGobblin = createEmbeddedGobblinCompactionJob("basic", basePath.getAbsolutePath()).setConfiguration(CompactionJobConfigurator.COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS_KEY, TestCompactionOrcJobConfigurator.Factory.class.getName()).setConfiguration(COMPACTION_OUTPUT_EXTENSION, extensionName).setConfiguration(ORC_MAPPER_SHUFFLE_KEY_SCHEMA, "struct<a:struct<a:int,c:int>>");
    JobExecutionResult execution = embeddedGobblin.run();
    Assert.assertTrue(execution.isSuccessful());
    // Verifying result: Reducer should catch all the false-duplicates
    File outputDir = new File(basePath, hourlyPath);
    FileSystem fs = FileSystem.getLocal(new Configuration());
    List<FileStatus> statuses = new ArrayList<>();
    reloadFolder(statuses, outputDir, fs);
    Assert.assertEquals(statuses.size(), 1);
    List<OrcStruct> result = readOrcFile(statuses.get(0).getPath());
    // Records 1-3 differ only in column "b", which is NOT part of the shuffle key,
    // so all three must survive; records 4 and 5 are identical and collapse to one.
    // Total: 3 survivors + 1 deduped record = 4.
    Assert.assertEquals(result.size(), 4);
    Assert.assertTrue(result.contains(nested_struct_1));
    Assert.assertTrue(result.contains(nested_struct_2));
    Assert.assertTrue(result.contains(nested_struct_3));
    Assert.assertTrue(result.contains(nested_struct_4));
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) JobExecutionResult(org.apache.gobblin.runtime.api.JobExecutionResult) OrcStruct(org.apache.orc.mapred.OrcStruct) FileSystem(org.apache.hadoop.fs.FileSystem) TypeDescription(org.apache.orc.TypeDescription) EmbeddedGobblin(org.apache.gobblin.runtime.embedded.EmbeddedGobblin) OrcFile(org.apache.orc.OrcFile) File(java.io.File) Test(org.testng.annotations.Test)

Example 10 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

In the class OrcCompactionTaskTest, the helper method createTestingData:

/**
 * Writes two small ORC files into {@code jobDir} for compaction tests.
 * file_0 holds records (1,2) and (2,3); file_1 holds (1,2) and (4,5),
 * so the (1,2) record is duplicated across the two files.
 */
private void createTestingData(File jobDir) throws Exception {
    TypeDescription schema = TypeDescription.fromString("struct<i:int,j:int>");
    // Build the four records from a small data table instead of four copy-pasted stanzas.
    int[][] values = { {1, 2}, {1, 2}, {2, 3}, {4, 5} };
    List<OrcStruct> records = new ArrayList<>();
    for (int[] pair : values) {
        OrcStruct record = (OrcStruct) OrcStruct.createValue(schema);
        record.setFieldValue("i", new IntWritable(pair[0]));
        record.setFieldValue("j", new IntWritable(pair[1]));
        records.add(record);
    }
    // Following pattern: FILENAME.RECORDCOUNT.EXTENSION
    File file_0 = new File(jobDir, "file_0.2." + extensionName);
    File file_1 = new File(jobDir, "file_1.2." + extensionName);
    writeOrcRecordsInFile(new Path(file_0.getAbsolutePath()), schema, ImmutableList.of(records.get(0), records.get(2)));
    writeOrcRecordsInFile(new Path(file_1.getAbsolutePath()), schema, ImmutableList.of(records.get(1), records.get(3)));
}
Also used : Path(org.apache.hadoop.fs.Path) OrcStruct(org.apache.orc.mapred.OrcStruct) TypeDescription(org.apache.orc.TypeDescription) OrcFile(org.apache.orc.OrcFile) File(java.io.File) IntWritable(org.apache.hadoop.io.IntWritable)

Aggregations

OrcStruct (org.apache.orc.mapred.OrcStruct)39 TypeDescription (org.apache.orc.TypeDescription)24 Configuration (org.apache.hadoop.conf.Configuration)18 IntWritable (org.apache.hadoop.io.IntWritable)17 Test (org.testng.annotations.Test)15 ArrayList (java.util.ArrayList)13 Test (org.junit.Test)9 OrcFile (org.apache.orc.OrcFile)8 OrcList (org.apache.orc.mapred.OrcList)8 File (java.io.File)7 InputRow (org.apache.druid.data.input.InputRow)7 Path (org.apache.hadoop.fs.Path)7 Job (org.apache.hadoop.mapreduce.Job)7 OrcUnion (org.apache.orc.mapred.OrcUnion)7 ImmutableList (com.google.common.collect.ImmutableList)6 List (java.util.List)6 HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig)6 Text (org.apache.hadoop.io.Text)6 OrcKey (org.apache.orc.mapred.OrcKey)5 OrcMap (org.apache.orc.mapred.OrcMap)5