use of org.apache.orc.mapred.OrcStruct in project druid by druid-io.
the class OrcStructConverterTest method testConvertRootFieldWithStructOfNullsReturningStructOfNulls.
/**
 * Converts a root field holding a struct whose "int" and "float" members are both
 * null, and expects the conversion to match a map that carries a null entry for
 * each of those members.
 */
@Test
public void testConvertRootFieldWithStructOfNullsReturningStructOfNulls() {
  final OrcStructConverter converter = new OrcStructConverter(false);

  // Schema of the nested struct: one int member and one float member.
  final TypeDescription nestedType = TypeDescription.createStruct()
      .addField("int", TypeDescription.createInt())
      .addField("float", TypeDescription.createFloat());

  // Every member is explicitly set to null, and the expected map mirrors that.
  final OrcStruct nestedValue = new OrcStruct(nestedType);
  final Map<String, Object> expectedMap = new HashMap<>();
  for (String memberName : new String[]{"int", "float"}) {
    nestedValue.setFieldValue(memberName, null);
    expectedMap.put(memberName, null);
  }

  assertConversion(converter, nestedType, expectedMap, nestedValue);
}
use of org.apache.orc.mapred.OrcStruct in project druid by druid-io.
the class OrcStructConverterTest method assertConversion.
/**
 * Wraps {@code actualValueInOrc} in a root struct under a single field named
 * "field" and checks how the converter handles that field.
 *
 * @param converter                    converter under test
 * @param fieldType                    ORC type of the single root field
 * @param expectedValueAfterConversion expected value; when null the null-value
 *                                     assertion is used instead of the value one
 * @param actualValueInOrc             value stored in the root field, may be null
 */
private static void assertConversion(
    OrcStructConverter converter,
    TypeDescription fieldType,
    @Nullable Object expectedValueAfterConversion,
    @Nullable WritableComparable actualValueInOrc) {
  final String rootFieldName = "field";
  final OrcStruct rootStruct = new OrcStruct(createRootSchema(rootFieldName, fieldType));
  rootStruct.setFieldValue(rootFieldName, actualValueInOrc);

  // A null expectation is verified through the dedicated null assertion.
  if (expectedValueAfterConversion == null) {
    assertNullValue(converter, rootStruct, rootFieldName);
    return;
  }
  assertFieldValue(expectedValueAfterConversion, converter, rootStruct, rootFieldName);
}
use of org.apache.orc.mapred.OrcStruct in project druid by druid-io.
the class OrcStructConverterTest method testConvertRootFieldWithStructOfNonNullPrimitivesReturningValuesAsTheyAre.
/**
 * Converts a root field holding a struct with a non-null int (10) and a non-null
 * float (10.f), and expects a map containing the unwrapped primitive values read
 * back from that struct.
 */
@Test
public void testConvertRootFieldWithStructOfNonNullPrimitivesReturningValuesAsTheyAre() {
  // Schema of the nested struct: one int member and one float member.
  final TypeDescription nestedType = TypeDescription.createStruct()
      .addField("int", TypeDescription.createInt())
      .addField("float", TypeDescription.createFloat());

  final OrcStruct nestedValue = new OrcStruct(nestedType);
  nestedValue.setFieldValue("int", new IntWritable(10));
  nestedValue.setFieldValue("float", new FloatWritable(10.f));

  // Expected entries are the primitives unwrapped from the writables stored above.
  final int expectedInt = ((IntWritable) nestedValue.getFieldValue("int")).get();
  final float expectedFloat = ((FloatWritable) nestedValue.getFieldValue("float")).get();
  final Map<String, Object> expectedMap = new HashMap<>();
  expectedMap.put("int", expectedInt);
  expectedMap.put("float", expectedFloat);

  assertConversion(new OrcStructConverter(false), nestedType, expectedMap, nestedValue);
}
use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.
the class OrcCompactionTaskTest method testReducerSideDedup.
/**
 * Writes five nested ORC records into a single input file, runs an embedded
 * compaction job whose shuffle key covers only part of the schema
 * ({@code struct<a:struct<a:int,c:int>>}), and checks that the single output file
 * holds four records: the first three plus one of the two identical trailing ones.
 */
@Test
public void testReducerSideDedup() throws Exception {
  final String minutelyPath = "Identity/MemberAccount/minutely/2020/04/03/10/20_30/run_2020-04-03-10-20";
  final String hourlyPath = "Identity/MemberAccount/hourly/2020/04/03/10/";

  final File basePath = Files.createTempDir();
  basePath.deleteOnExit();
  final File jobDir = new File(basePath, minutelyPath);
  Assert.assertTrue(jobDir.mkdirs());

  final TypeDescription nestedSchema =
      TypeDescription.fromString("struct<a:struct<a:int,b:string,c:int>,b:string,c:uniontype<int,string>>");

  // Three records filled from the same int seed, each given its own top-level "b" value.
  final OrcStruct firstRecord = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(firstRecord, nestedSchema, 1, "test1", true);
  firstRecord.setFieldValue("b", new Text("uno"));

  final OrcStruct secondRecord = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(secondRecord, nestedSchema, 1, "test2", true);
  secondRecord.setFieldValue("b", new Text("dos"));

  final OrcStruct thirdRecord = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(thirdRecord, nestedSchema, 1, "test3", true);
  thirdRecord.setFieldValue("b", new Text("tres"));

  // Two further records filled from a different seed; both are built with identical arguments.
  final OrcStruct fourthRecord = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(fourthRecord, nestedSchema, 2, "test2", false);
  fourthRecord.setFieldValue("b", new Text("uno"));

  // Intended to be treated as a duplicate of fourthRecord.
  final OrcStruct fifthRecord = (OrcStruct) OrcUtils.createValueRecursively(nestedSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(fifthRecord, nestedSchema, 2, "test2", false);
  fifthRecord.setFieldValue("b", new Text("uno"));

  // Input file name follows the pattern FILENAME.RECORDCOUNT.EXTENSION
  final File inputFile = new File(jobDir, "file_0.5." + extensionName);
  writeOrcRecordsInFile(
      new Path(inputFile.getAbsolutePath()),
      nestedSchema,
      ImmutableList.of(firstRecord, secondRecord, thirdRecord, fourthRecord, fifthRecord));

  final EmbeddedGobblin compactionJob = createEmbeddedGobblinCompactionJob("basic", basePath.getAbsolutePath())
      .setConfiguration(
          CompactionJobConfigurator.COMPACTION_JOB_CONFIGURATOR_FACTORY_CLASS_KEY,
          TestCompactionOrcJobConfigurator.Factory.class.getName())
      .setConfiguration(COMPACTION_OUTPUT_EXTENSION, extensionName)
      .setConfiguration(ORC_MAPPER_SHUFFLE_KEY_SCHEMA, "struct<a:struct<a:int,c:int>>");
  final JobExecutionResult jobResult = compactionJob.run();
  Assert.assertTrue(jobResult.isSuccessful());

  // Verifying result: the reducer should catch all the false-duplicates.
  final File outputDir = new File(basePath, hourlyPath);
  final FileSystem localFs = FileSystem.getLocal(new Configuration());
  final List<FileStatus> outputFiles = new ArrayList<>();
  reloadFolder(outputFiles, outputDir, localFs);
  Assert.assertEquals(outputFiles.size(), 1);

  // Four records are expected: the first three survive because they differ in columns
  // outside the shuffle key, and the identical fourth/fifth pair collapses into one.
  final List<OrcStruct> compacted = readOrcFile(outputFiles.get(0).getPath());
  Assert.assertEquals(compacted.size(), 4);
  for (OrcStruct expectedRecord : ImmutableList.of(firstRecord, secondRecord, thirdRecord, fourthRecord)) {
    Assert.assertTrue(compacted.contains(expectedRecord));
  }
}
use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.
the class OrcCompactionTaskTest method createTestingData.
/**
 * Writes two ORC input files into {@code jobDir}, each holding two records of
 * schema {@code struct<i:int,j:int>}. Both files contain a record with values
 * (1, 2); the first file also contains (2, 3) and the second (4, 5).
 *
 * @param jobDir directory that receives the two input files
 * @throws Exception propagated from the ORC writing helper
 */
private void createTestingData(File jobDir) throws Exception {
  final TypeDescription schema = TypeDescription.fromString("struct<i:int,j:int>");

  // Records destined for the first file.
  final OrcStruct firstFileRecordA = (OrcStruct) OrcStruct.createValue(schema);
  firstFileRecordA.setFieldValue("i", new IntWritable(1));
  firstFileRecordA.setFieldValue("j", new IntWritable(2));
  final OrcStruct firstFileRecordB = (OrcStruct) OrcStruct.createValue(schema);
  firstFileRecordB.setFieldValue("i", new IntWritable(2));
  firstFileRecordB.setFieldValue("j", new IntWritable(3));

  // Records destined for the second file; the first one repeats the values (1, 2).
  final OrcStruct secondFileRecordA = (OrcStruct) OrcStruct.createValue(schema);
  secondFileRecordA.setFieldValue("i", new IntWritable(1));
  secondFileRecordA.setFieldValue("j", new IntWritable(2));
  final OrcStruct secondFileRecordB = (OrcStruct) OrcStruct.createValue(schema);
  secondFileRecordB.setFieldValue("i", new IntWritable(4));
  secondFileRecordB.setFieldValue("j", new IntWritable(5));

  // File names follow the pattern FILENAME.RECORDCOUNT.EXTENSION
  final File firstFile = new File(jobDir, "file_0.2." + extensionName);
  final File secondFile = new File(jobDir, "file_1.2." + extensionName);
  writeOrcRecordsInFile(
      new Path(firstFile.getAbsolutePath()), schema, ImmutableList.of(firstFileRecordA, firstFileRecordB));
  writeOrcRecordsInFile(
      new Path(secondFile.getAbsolutePath()), schema, ImmutableList.of(secondFileRecordA, secondFileRecordB));
}
Aggregations