Use of io.druid.indexer.HadoopIngestionSpec in project druid by druid-io.
In class HadoopIndexTask, method run:
@SuppressWarnings("unchecked")
@Override
public TaskStatus run(TaskToolbox toolbox) throws Exception
{
  final ClassLoader loader = buildClassLoader(toolbox);
  boolean determineIntervals = !spec.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();

  spec = HadoopIngestionSpec.updateSegmentListIfDatasourcePathSpecIsUsed(
      spec,
      jsonMapper,
      new OverlordActionBasedUsedSegmentLister(toolbox)
  );

  final String config = invokeForeignLoader(
      "io.druid.indexing.common.task.HadoopIndexTask$HadoopDetermineConfigInnerProcessing",
      new String[]{
          toolbox.getObjectMapper().writeValueAsString(spec),
          toolbox.getConfig().getHadoopWorkingPath(),
          toolbox.getSegmentPusher().getPathForHadoop()
      },
      loader
  );

  final HadoopIngestionSpec indexerSchema = toolbox.getObjectMapper().readValue(config, HadoopIngestionSpec.class);

  // We should have a lock from before we started running only if interval was specified
  String version;
  if (determineIntervals) {
    Interval interval = JodaUtils.umbrellaInterval(
        JodaUtils.condenseIntervals(indexerSchema.getDataSchema().getGranularitySpec().bucketIntervals().get())
    );
    TaskLock lock = toolbox.getTaskActionClient().submit(new LockAcquireAction(interval));
    version = lock.getVersion();
  } else {
    Iterable<TaskLock> locks = getTaskLocks(toolbox);
    final TaskLock myLock = Iterables.getOnlyElement(locks);
    version = myLock.getVersion();
  }

  final String specVersion = indexerSchema.getTuningConfig().getVersion();
  if (indexerSchema.getTuningConfig().isUseExplicitVersion()) {
    if (specVersion.compareTo(version) < 0) {
      version = specVersion;
    } else {
      log.error(
          "Spec version can not be greater than or equal to the lock version, Spec version: [%s] Lock version: [%s].",
          specVersion,
          version
      );
      return TaskStatus.failure(getId());
    }
  }

  log.info("Setting version to: %s", version);

  final String segments = invokeForeignLoader(
      "io.druid.indexing.common.task.HadoopIndexTask$HadoopIndexGeneratorInnerProcessing",
      new String[]{
          toolbox.getObjectMapper().writeValueAsString(indexerSchema),
          version
      },
      loader
  );

  if (segments != null) {
    List<DataSegment> publishedSegments = toolbox.getObjectMapper().readValue(
        segments,
        new TypeReference<List<DataSegment>>() {}
    );
    toolbox.publishSegments(publishedSegments);
    return TaskStatus.success(getId());
  } else {
    return TaskStatus.failure(getId());
  }
}
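The interesting part of this method is how it crosses the classloader boundary: the HadoopIngestionSpec is serialized to JSON with the toolbox's ObjectMapper, handed to HadoopDetermineConfigInnerProcessing inside the isolated Hadoop classloader, and the updated spec (and later the published segment list) is read back from JSON. A minimal sketch of that round-trip, assuming a mapper with Druid's Jackson modules registered (toolbox.getObjectMapper() in the task):

import com.fasterxml.jackson.databind.ObjectMapper;
import io.druid.indexer.HadoopIngestionSpec;

// Sketch only: write the spec as the JSON string handed to the foreign loader,
// then read it back the way the task reads the determine-config result.
static HadoopIngestionSpec roundTripSpec(ObjectMapper mapper, HadoopIngestionSpec spec) throws Exception
{
  final String json = mapper.writeValueAsString(spec);
  return mapper.readValue(json, HadoopIngestionSpec.class);
}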
Use of io.druid.indexer.HadoopIngestionSpec in project druid by druid-io.
In class GranularityPathSpecTest, method testAddInputPath:
@Test
public void testAddInputPath() throws Exception
{
  UserGroupInformation.setLoginUser(UserGroupInformation.createUserForTesting("test", new String[]{"testGroup"}));
  HadoopIngestionSpec spec = new HadoopIngestionSpec(
      new DataSchema("foo", null, new AggregatorFactory[0],
          new UniformGranularitySpec(Granularities.DAY, Granularities.MINUTE, ImmutableList.of(new Interval("2015-11-06T00:00Z/2015-11-07T00:00Z"))), jsonMapper),
      new HadoopIOConfig(null, null, null),
      DEFAULT_TUNING_CONFIG
  );
  granularityPathSpec.setDataGranularity(Granularities.HOUR);
  granularityPathSpec.setFilePattern(".*");
  granularityPathSpec.setInputFormat(TextInputFormat.class);
  Job job = Job.getInstance();
  String formatStr = "file:%s/%s;org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
  testFolder.newFolder("test", "y=2015", "m=11", "d=06", "H=00");
  testFolder.newFolder("test", "y=2015", "m=11", "d=06", "H=02");
  testFolder.newFolder("test", "y=2015", "m=11", "d=06", "H=05");
  testFolder.newFile("test/y=2015/m=11/d=06/H=00/file1");
  testFolder.newFile("test/y=2015/m=11/d=06/H=02/file2");
  testFolder.newFile("test/y=2015/m=11/d=06/H=05/file3");
  testFolder.newFile("test/y=2015/m=11/d=06/H=05/file4");
  granularityPathSpec.setInputPath(testFolder.getRoot().getPath() + "/test");
  granularityPathSpec.addInputPaths(HadoopDruidIndexerConfig.fromSpec(spec), job);
  String actual = job.getConfiguration().get("mapreduce.input.multipleinputs.dir.formats");
  String expected = Joiner.on(",").join(Lists.newArrayList(
      String.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=00/file1"),
      String.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=02/file2"),
      String.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=05/file3"),
      String.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=05/file4")
  ));
  Assert.assertEquals("Did not find expected input paths", expected, actual);
}
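Note that this test never calls setPathFormat, so the directories it creates follow the Hive-style y=/m=/d=/H= layout, presumably the path spec's default for hourly buckets. A small Joda-Time sketch of that assumed pattern (the pattern string here is an assumption, not read from GranularityPathSpec):

import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

// Assumed default layout; the test's folder names (y=2015/m=11/d=06/H=02) match it.
DateTimeFormatter hiveHourly = DateTimeFormat.forPattern("'y'=yyyy/'m'=MM/'d'=dd/'H'=HH").withZoneUTC();
String dir = hiveHourly.print(new DateTime("2015-11-06T02:00:00Z", DateTimeZone.UTC));
// dir -> "y=2015/m=11/d=06/H=02", the bucket that holds file2 above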
Use of io.druid.indexer.HadoopIngestionSpec in project druid by druid-io.
In class TaskSerdeTest, method testHadoopIndexTaskSerde:
@Test
public void testHadoopIndexTaskSerde() throws Exception
{
  final HadoopIndexTask task = new HadoopIndexTask(
      null,
      new HadoopIngestionSpec(
          new DataSchema("foo", null, new AggregatorFactory[0],
              new UniformGranularitySpec(Granularities.DAY, null, ImmutableList.of(new Interval("2010-01-01/P1D"))), jsonMapper),
          new HadoopIOConfig(ImmutableMap.<String, Object>of("paths", "bar"), null, null),
          null
      ),
      null, null, "blah", jsonMapper, null
  );
  final String json = jsonMapper.writeValueAsString(task);
  final HadoopIndexTask task2 = (HadoopIndexTask) jsonMapper.readValue(json, Task.class);
  Assert.assertEquals("foo", task.getDataSource());
  Assert.assertEquals(task.getId(), task2.getId());
  Assert.assertEquals(task.getGroupId(), task2.getGroupId());
  Assert.assertEquals(task.getDataSource(), task2.getDataSource());
  Assert.assertEquals(task.getSpec().getTuningConfig().getJobProperties(), task2.getSpec().getTuningConfig().getJobProperties());
  Assert.assertEquals("blah", task.getClasspathPrefix());
  Assert.assertEquals("blah", task2.getClasspathPrefix());
}
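The cast to HadoopIndexTask after readValue(json, Task.class) works because Task is a Jackson-polymorphic type: the serialized JSON carries a type tag that the mapper uses to pick the concrete subtype. A short, more defensive variant of the same check, reusing the test's jsonMapper and json:

// Hedged variant of the round-trip: read back as the base Task type and verify
// the concrete subtype before casting, rather than casting unconditionally.
final Task generic = jsonMapper.readValue(json, Task.class);
Assert.assertTrue(generic instanceof HadoopIndexTask);
final HadoopIndexTask restored = (HadoopIndexTask) generic;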
Use of io.druid.indexer.HadoopIngestionSpec in project druid by druid-io.
In class OrcIndexGeneratorJobTest, method setUp:
@Before
public void setUp() throws Exception
{
  mapper = HadoopDruidIndexerConfig.JSON_MAPPER;
  mapper.registerSubtypes(new NamedType(HashBasedNumberedShardSpec.class, "hashed"));
  dataRoot = temporaryFolder.newFolder("data");
  outputRoot = temporaryFolder.newFolder("output");
  File dataFile = writeDataToLocalOrcFile(dataRoot, data);
  HashMap<String, Object> inputSpec = new HashMap<String, Object>();
  inputSpec.put("paths", dataFile.getCanonicalPath());
  inputSpec.put("type", "static");
  inputSpec.put("inputFormat", "org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat");
  config = new HadoopDruidIndexerConfig(
      new HadoopIngestionSpec(
          new DataSchema(dataSourceName, mapper.convertValue(inputRowParser, Map.class), aggs,
              new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(this.interval)), mapper),
          new HadoopIOConfig(ImmutableMap.copyOf(inputSpec), null, outputRoot.getCanonicalPath()),
          new HadoopTuningConfig(
              outputRoot.getCanonicalPath(),
              null, null, null, null, null, false, false, false, false,
              // verifies that a set number of reducers is ignored
              ImmutableMap.of(JobContext.NUM_REDUCES, "0"),
              false, true, null, true, null, false, false
          )
      )
  );
  config.setShardSpecs(loadShardSpecs(shardInfoForEachSegment));
  config = HadoopDruidIndexerConfig.fromSpec(config.getSchema());
}
Use of io.druid.indexer.HadoopIngestionSpec in project druid by druid-io.
In class GranularityPathSpecTest, method testIntervalTrimming:
@Test
public void testIntervalTrimming() throws Exception
{
  UserGroupInformation.setLoginUser(UserGroupInformation.createUserForTesting("test", new String[]{"testGroup"}));
  HadoopIngestionSpec spec = new HadoopIngestionSpec(
      new DataSchema("foo", null, new AggregatorFactory[0],
          new UniformGranularitySpec(Granularities.DAY, Granularities.ALL, ImmutableList.of(new Interval("2015-01-01T11Z/2015-01-02T05Z"))), jsonMapper),
      new HadoopIOConfig(null, null, null),
      DEFAULT_TUNING_CONFIG
  );
  granularityPathSpec.setDataGranularity(Granularities.HOUR);
  granularityPathSpec.setPathFormat("yyyy/MM/dd/HH");
  granularityPathSpec.setFilePattern(".*");
  granularityPathSpec.setInputFormat(TextInputFormat.class);
  Job job = Job.getInstance();
  String formatStr = "file:%s/%s;org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
  createFile(
      testFolder,
      "test/2015/01/01/00/file1", "test/2015/01/01/10/file2", "test/2015/01/01/18/file3",
      "test/2015/01/02/00/file1", "test/2015/01/02/03/file2", "test/2015/01/02/05/file3",
      "test/2015/01/02/07/file4", "test/2015/01/02/09/file5"
  );
  granularityPathSpec.setInputPath(testFolder.getRoot().getPath() + "/test");
  granularityPathSpec.addInputPaths(HadoopDruidIndexerConfig.fromSpec(spec), job);
  String actual = job.getConfiguration().get("mapreduce.input.multipleinputs.dir.formats");
  String expected = Joiner.on(",").join(Lists.newArrayList(
      String.format(formatStr, testFolder.getRoot(), "test/2015/01/01/18/file3"),
      String.format(formatStr, testFolder.getRoot(), "test/2015/01/02/00/file1"),
      String.format(formatStr, testFolder.getRoot(), "test/2015/01/02/03/file2")
  ));
  Assert.assertEquals("Did not find expected input paths", expected, actual);
}
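The assertion depends on the trimming the test is named for: only hourly buckets that fall inside the spec's interval (2015-01-01T11Z/2015-01-02T05Z) contribute paths, which is why just three of the eight files appear in expected. A simplified Joda-Time illustration of that overlap check (an approximation; GranularityPathSpec itself walks granularity buckets over the spec's condensed input intervals):

import org.joda.time.Interval;

// An hourly bucket contributes input paths only if it overlaps the spec interval;
// a bucket that merely abuts the end of the interval does not.
Interval specInterval = Interval.parse("2015-01-01T11:00Z/2015-01-02T05:00Z");
Interval hour18 = Interval.parse("2015-01-01T18:00Z/2015-01-01T19:00Z");  // holds file3 on day 1
Interval hour05 = Interval.parse("2015-01-02T05:00Z/2015-01-02T06:00Z");  // holds file3 on day 2
System.out.println(specInterval.overlaps(hour18)); // true  -> path kept
System.out.println(specInterval.overlaps(hour05)); // false -> path dropped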