Example 1 with HadoopIngestionSpec

Use of io.druid.indexer.HadoopIngestionSpec in project druid by druid-io.

From the class HadoopIndexTask, the run method, which updates the spec, resolves a lock version, and delegates the determine-config and index-generation phases to an isolated classloader:

@SuppressWarnings("unchecked")
@Override
public TaskStatus run(TaskToolbox toolbox) throws Exception {
    final ClassLoader loader = buildClassLoader(toolbox);
    boolean determineIntervals = !spec.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();
    spec = HadoopIngestionSpec.updateSegmentListIfDatasourcePathSpecIsUsed(
        spec,
        jsonMapper,
        new OverlordActionBasedUsedSegmentLister(toolbox)
    );
    // Phase 1: run the determine-config step inside the isolated Hadoop classloader.
    final String config = invokeForeignLoader(
        "io.druid.indexing.common.task.HadoopIndexTask$HadoopDetermineConfigInnerProcessing",
        new String[] {
            toolbox.getObjectMapper().writeValueAsString(spec),
            toolbox.getConfig().getHadoopWorkingPath(),
            toolbox.getSegmentPusher().getPathForHadoop()
        },
        loader
    );
    final HadoopIngestionSpec indexerSchema = toolbox.getObjectMapper().readValue(config, HadoopIngestionSpec.class);
    // We should have a lock from before we started running only if interval was specified
    String version;
    if (determineIntervals) {
        // No intervals were given up front, so acquire a lock over the umbrella of
        // the intervals the determine-config phase discovered.
        Interval interval = JodaUtils.umbrellaInterval(
            JodaUtils.condenseIntervals(indexerSchema.getDataSchema().getGranularitySpec().bucketIntervals().get())
        );
        TaskLock lock = toolbox.getTaskActionClient().submit(new LockAcquireAction(interval));
        version = lock.getVersion();
    } else {
        Iterable<TaskLock> locks = getTaskLocks(toolbox);
        final TaskLock myLock = Iterables.getOnlyElement(locks);
        version = myLock.getVersion();
    }
    final String specVersion = indexerSchema.getTuningConfig().getVersion();
    if (indexerSchema.getTuningConfig().isUseExplicitVersion()) {
        // An explicitly supplied version is usable only if it sorts strictly before the lock version.
        if (specVersion.compareTo(version) < 0) {
            version = specVersion;
        } else {
            log.error("Spec version can not be greater than or equal to the lock version, Spec version: [%s] Lock version: [%s].", specVersion, version);
            return TaskStatus.failure(getId());
        }
    }
    log.info("Setting version to: %s", version);
    // Phase 2: run the index-generation step in the same isolated classloader.
    final String segments = invokeForeignLoader(
        "io.druid.indexing.common.task.HadoopIndexTask$HadoopIndexGeneratorInnerProcessing",
        new String[] { toolbox.getObjectMapper().writeValueAsString(indexerSchema), version },
        loader
    );
    if (segments != null) {
        List<DataSegment> publishedSegments = toolbox.getObjectMapper().readValue(
            segments,
            new TypeReference<List<DataSegment>>() {}
        );
        toolbox.publishSegments(publishedSegments);
        return TaskStatus.success(getId());
    } else {
        return TaskStatus.failure(getId());
    }
}
Also used : HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) OverlordActionBasedUsedSegmentLister(io.druid.indexing.hadoop.OverlordActionBasedUsedSegmentLister) DataSegment(io.druid.timeline.DataSegment) TaskLock(io.druid.indexing.common.TaskLock) LockAcquireAction(io.druid.indexing.common.actions.LockAcquireAction) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) Interval(org.joda.time.Interval)
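
The interesting mechanics here live in invokeForeignLoader, which runs each Hadoop phase inside an isolated classloader so the task's own classpath does not clash with Hadoop's dependencies. Below is a minimal sketch of that pattern; the entry-point name runTask and the String[]-in convention are assumptions for illustration, not the exact Druid contract.

import java.lang.reflect.Method;

// Sketch of the run-in-a-foreign-classloader pattern, assuming a static
// "runTask(String[])" entry point on the target class; the real Druid
// signature may differ.
public final class ForeignLoaderSketch {
    @SuppressWarnings("unchecked")
    public static <T> T invokeForeignLoader(String className, String[] args, ClassLoader loader) throws Exception {
        final ClassLoader oldLoader = Thread.currentThread().getContextClassLoader();
        try {
            // Swap the thread's context classloader so anything the foreign code
            // loads reflectively resolves against the isolated loader.
            Thread.currentThread().setContextClassLoader(loader);
            final Class<?> clazz = loader.loadClass(className);
            final Method method = clazz.getMethod("runTask", String[].class);
            return (T) method.invoke(null, new Object[]{ args });
        } finally {
            // Always restore the original classloader, even on failure.
            Thread.currentThread().setContextClassLoader(oldLoader);
        }
    }
}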

Example 2 with HadoopIngestionSpec

Use of io.druid.indexer.HadoopIngestionSpec in project druid by druid-io.

From the class GranularityPathSpecTest, the test method testAddInputPath, which verifies that an hourly GranularityPathSpec picks up every matching file under the y=/m=/d=/H= directory layout:

@Test
public void testAddInputPath() throws Exception {
    UserGroupInformation.setLoginUser(UserGroupInformation.createUserForTesting("test", new String[] { "testGroup" }));
    HadoopIngestionSpec spec = new HadoopIngestionSpec(
        new DataSchema("foo", null, new AggregatorFactory[0],
            new UniformGranularitySpec(Granularities.DAY, Granularities.MINUTE,
                ImmutableList.of(new Interval("2015-11-06T00:00Z/2015-11-07T00:00Z"))),
            jsonMapper),
        new HadoopIOConfig(null, null, null),
        DEFAULT_TUNING_CONFIG);
    granularityPathSpec.setDataGranularity(Granularities.HOUR);
    granularityPathSpec.setFilePattern(".*");
    granularityPathSpec.setInputFormat(TextInputFormat.class);
    Job job = Job.getInstance();
    String formatStr = "file:%s/%s;org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
    testFolder.newFolder("test", "y=2015", "m=11", "d=06", "H=00");
    testFolder.newFolder("test", "y=2015", "m=11", "d=06", "H=02");
    testFolder.newFolder("test", "y=2015", "m=11", "d=06", "H=05");
    testFolder.newFile("test/y=2015/m=11/d=06/H=00/file1");
    testFolder.newFile("test/y=2015/m=11/d=06/H=02/file2");
    testFolder.newFile("test/y=2015/m=11/d=06/H=05/file3");
    testFolder.newFile("test/y=2015/m=11/d=06/H=05/file4");
    granularityPathSpec.setInputPath(testFolder.getRoot().getPath() + "/test");
    granularityPathSpec.addInputPaths(HadoopDruidIndexerConfig.fromSpec(spec), job);
    String actual = job.getConfiguration().get("mapreduce.input.multipleinputs.dir.formats");
    String expected = Joiner.on(",").join(Lists.newArrayList(
        String.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=00/file1"),
        String.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=02/file2"),
        String.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=05/file3"),
        String.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=05/file4")));
    Assert.assertEquals("Did not find expected input paths", expected, actual);
}
Also used : HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) DataSchema(io.druid.segment.indexing.DataSchema) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec) AggregatorFactory(io.druid.query.aggregation.AggregatorFactory) Job(org.apache.hadoop.mapreduce.Job) HadoopIOConfig(io.druid.indexer.HadoopIOConfig) Interval(org.joda.time.Interval) Test(org.junit.Test)
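
For context, the y=/m=/d=/H= directories in the test follow GranularityPathSpec's default hourly path format. A hedged sketch of how an interval conceptually expands into such candidate directories; the helper below is illustrative, not the Druid implementation:

import org.joda.time.DateTime;
import org.joda.time.Interval;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;

public final class HourlyPathSketch {
    // Expand an interval into hourly candidate directories under inputPath,
    // using the default hourly layout seen in the test above.
    static List<String> expand(String inputPath, Interval interval) {
        DateTimeFormatter fmt = DateTimeFormat.forPattern("'y'=yyyy/'m'=MM/'d'=dd/'H'=HH");
        List<String> paths = new ArrayList<>();
        for (DateTime t = interval.getStart(); t.isBefore(interval.getEnd()); t = t.plusHours(1)) {
            paths.add(inputPath + "/" + fmt.print(t));
        }
        return paths;
    }
}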

Example 3 with HadoopIngestionSpec

Use of io.druid.indexer.HadoopIngestionSpec in project druid by druid-io.

From the class TaskSerdeTest, the test method testHadoopIndexTaskSerde, which round-trips a HadoopIndexTask through JSON and checks that its key fields survive:

@Test
public void testHadoopIndexTaskSerde() throws Exception {
    final HadoopIndexTask task = new HadoopIndexTask(
        null,
        new HadoopIngestionSpec(
            new DataSchema("foo", null, new AggregatorFactory[0],
                new UniformGranularitySpec(Granularities.DAY, null, ImmutableList.of(new Interval("2010-01-01/P1D"))),
                jsonMapper),
            new HadoopIOConfig(ImmutableMap.<String, Object>of("paths", "bar"), null, null),
            null),
        null, null, "blah", jsonMapper, null);
    final String json = jsonMapper.writeValueAsString(task);
    final HadoopIndexTask task2 = (HadoopIndexTask) jsonMapper.readValue(json, Task.class);
    Assert.assertEquals("foo", task.getDataSource());
    Assert.assertEquals(task.getId(), task2.getId());
    Assert.assertEquals(task.getGroupId(), task2.getGroupId());
    Assert.assertEquals(task.getDataSource(), task2.getDataSource());
    Assert.assertEquals(task.getSpec().getTuningConfig().getJobProperties(), task2.getSpec().getTuningConfig().getJobProperties());
    Assert.assertEquals("blah", task.getClasspathPrefix());
    Assert.assertEquals("blah", task2.getClasspathPrefix());
}
Also used : HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) DataSchema(io.druid.segment.indexing.DataSchema) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec) HadoopIOConfig(io.druid.indexer.HadoopIOConfig) Interval(org.joda.time.Interval) Test(org.junit.Test)
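
The test follows the standard Jackson round-trip pattern: serialize through the concrete type, deserialize through the polymorphic Task base type, and compare fields. A generic sketch of that pattern follows; note that a real Druid mapper also needs injectable values registered before it can rebuild a task.

import com.fasterxml.jackson.databind.ObjectMapper;

public final class SerdeTestSupport {
    // Round-trip helper: the base type must carry Jackson polymorphic type
    // info for readValue to resolve the right concrete class.
    public static <T> T roundTrip(ObjectMapper mapper, T value, Class<T> baseType) throws Exception {
        final String json = mapper.writeValueAsString(value);
        return mapper.readValue(json, baseType);
    }
}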

Example 4 with HadoopIngestionSpec

Use of io.druid.indexer.HadoopIngestionSpec in project druid by druid-io.

From the class OrcIndexGeneratorJobTest, the setUp method, which builds a HadoopDruidIndexerConfig around a static ORC input spec and registers the "hashed" shard-spec subtype:

@Before
public void setUp() throws Exception {
    mapper = HadoopDruidIndexerConfig.JSON_MAPPER;
    mapper.registerSubtypes(new NamedType(HashBasedNumberedShardSpec.class, "hashed"));
    dataRoot = temporaryFolder.newFolder("data");
    outputRoot = temporaryFolder.newFolder("output");
    File dataFile = writeDataToLocalOrcFile(dataRoot, data);
    HashMap<String, Object> inputSpec = new HashMap<String, Object>();
    inputSpec.put("paths", dataFile.getCanonicalPath());
    inputSpec.put("type", "static");
    inputSpec.put("inputFormat", "org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat");
    config = new HadoopDruidIndexerConfig(
        new HadoopIngestionSpec(
            new DataSchema(dataSourceName,
                mapper.convertValue(inputRowParser, Map.class),
                aggs,
                new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(this.interval)),
                mapper),
            new HadoopIOConfig(ImmutableMap.copyOf(inputSpec), null, outputRoot.getCanonicalPath()),
            new HadoopTuningConfig(outputRoot.getCanonicalPath(), null, null, null, null, null, false, false, false, false,
                // verifies that set num reducers is ignored
                ImmutableMap.of(JobContext.NUM_REDUCES, "0"),
                false, true, null, true, null, false, false)));
    config.setShardSpecs(loadShardSpecs(shardInfoForEachSegment));
    config = HadoopDruidIndexerConfig.fromSpec(config.getSchema());
}
Also used : HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) HashMap(java.util.HashMap) NamedType(com.fasterxml.jackson.databind.jsontype.NamedType) HadoopTuningConfig(io.druid.indexer.HadoopTuningConfig) HadoopDruidIndexerConfig(io.druid.indexer.HadoopDruidIndexerConfig) HadoopIOConfig(io.druid.indexer.HadoopIOConfig) DataSchema(io.druid.segment.indexing.DataSchema) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec) OrcFile(org.apache.orc.OrcFile) File(java.io.File) Before(org.junit.Before)
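
The registerSubtypes call in setUp() is what lets Jackson map the "hashed" type tag in shard-spec JSON to HashBasedNumberedShardSpec. A self-contained sketch of the same mechanism, using hypothetical stand-in types rather than Druid's real ShardSpec hierarchy:

import com.fasterxml.jackson.annotation.JsonTypeInfo;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.jsontype.NamedType;

public final class NamedTypeSketch {
    // Hypothetical polymorphic base; Druid's ShardSpec plays this role.
    @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, property = "type")
    interface ShardSpecLike {}

    // Hypothetical concrete subtype standing in for HashBasedNumberedShardSpec.
    static final class HashedShardSpecLike implements ShardSpecLike {}

    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        // Bind the JSON tag "hashed" to the concrete class, as setUp() does.
        mapper.registerSubtypes(new NamedType(HashedShardSpecLike.class, "hashed"));
        ShardSpecLike spec = mapper.readValue("{\"type\":\"hashed\"}", ShardSpecLike.class);
        System.out.println(spec.getClass().getSimpleName()); // HashedShardSpecLike
    }
}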

Example 5 with HadoopIngestionSpec

Use of io.druid.indexer.HadoopIngestionSpec in project druid by druid-io.

From the class GranularityPathSpecTest, the test method testIntervalTrimming, which verifies that hourly buckets outside the ingestion interval are excluded from the input paths:

@Test
public void testIntervalTrimming() throws Exception {
    UserGroupInformation.setLoginUser(UserGroupInformation.createUserForTesting("test", new String[] { "testGroup" }));
    HadoopIngestionSpec spec = new HadoopIngestionSpec(
        new DataSchema("foo", null, new AggregatorFactory[0],
            new UniformGranularitySpec(Granularities.DAY, Granularities.ALL,
                ImmutableList.of(new Interval("2015-01-01T11Z/2015-01-02T05Z"))),
            jsonMapper),
        new HadoopIOConfig(null, null, null),
        DEFAULT_TUNING_CONFIG);
    granularityPathSpec.setDataGranularity(Granularities.HOUR);
    granularityPathSpec.setPathFormat("yyyy/MM/dd/HH");
    granularityPathSpec.setFilePattern(".*");
    granularityPathSpec.setInputFormat(TextInputFormat.class);
    Job job = Job.getInstance();
    String formatStr = "file:%s/%s;org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
    createFile(testFolder,
        "test/2015/01/01/00/file1", "test/2015/01/01/10/file2", "test/2015/01/01/18/file3",
        "test/2015/01/02/00/file1", "test/2015/01/02/03/file2", "test/2015/01/02/05/file3",
        "test/2015/01/02/07/file4", "test/2015/01/02/09/file5");
    granularityPathSpec.setInputPath(testFolder.getRoot().getPath() + "/test");
    granularityPathSpec.addInputPaths(HadoopDruidIndexerConfig.fromSpec(spec), job);
    String actual = job.getConfiguration().get("mapreduce.input.multipleinputs.dir.formats");
    String expected = Joiner.on(",").join(Lists.newArrayList(
        String.format(formatStr, testFolder.getRoot(), "test/2015/01/01/18/file3"),
        String.format(formatStr, testFolder.getRoot(), "test/2015/01/02/00/file1"),
        String.format(formatStr, testFolder.getRoot(), "test/2015/01/02/03/file2")));
    Assert.assertEquals("Did not find expected input paths", expected, actual);
}
Also used : HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) DataSchema(io.druid.segment.indexing.DataSchema) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec) AggregatorFactory(io.druid.query.aggregation.AggregatorFactory) Job(org.apache.hadoop.mapreduce.Job) HadoopIOConfig(io.druid.indexer.HadoopIOConfig) Interval(org.joda.time.Interval) Test(org.junit.Test)
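
The trimming rule the test pins down: an hourly bucket survives only if it overlaps the ingestion interval, and interval ends are exclusive, which is why both the 10Z file on January 1 and the 05Z file on January 2 are dropped. A hedged sketch of that check, illustrative rather than the Druid implementation:

import org.joda.time.DateTime;
import org.joda.time.Interval;

public final class TrimmingSketch {
    // An hourly bucket is kept only when it overlaps the spec interval.
    static boolean bucketIsKept(DateTime hourStart, Interval specInterval) {
        return specInterval.overlaps(new Interval(hourStart, hourStart.plusHours(1)));
    }

    public static void main(String[] args) {
        Interval spec = Interval.parse("2015-01-01T11:00Z/2015-01-02T05:00Z");
        System.out.println(bucketIsKept(new DateTime("2015-01-01T10:00Z"), spec)); // false
        System.out.println(bucketIsKept(new DateTime("2015-01-01T18:00Z"), spec)); // true
        System.out.println(bucketIsKept(new DateTime("2015-01-02T05:00Z"), spec)); // false
    }
}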

Aggregations

HadoopIngestionSpec (io.druid.indexer.HadoopIngestionSpec): 7
HadoopIOConfig (io.druid.indexer.HadoopIOConfig): 6
DataSchema (io.druid.segment.indexing.DataSchema): 6
UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec): 5
Interval (org.joda.time.Interval): 5
Test (org.junit.Test): 4
HadoopDruidIndexerConfig (io.druid.indexer.HadoopDruidIndexerConfig): 3
Job (org.apache.hadoop.mapreduce.Job): 3
HadoopTuningConfig (io.druid.indexer.HadoopTuningConfig): 2
AggregatorFactory (io.druid.query.aggregation.AggregatorFactory): 2
File (java.io.File): 2
Before (org.junit.Before): 2
NamedType (com.fasterxml.jackson.databind.jsontype.NamedType): 1
ImmutableList (com.google.common.collect.ImmutableList): 1
ImmutableMap (com.google.common.collect.ImmutableMap): 1
ByteSource (com.google.common.io.ByteSource): 1
DelimitedParseSpec (io.druid.data.input.impl.DelimitedParseSpec): 1
DimensionsSpec (io.druid.data.input.impl.DimensionsSpec): 1
StringInputRowParser (io.druid.data.input.impl.StringInputRowParser): 1
TimestampSpec (io.druid.data.input.impl.TimestampSpec): 1