Examples with HadoopIOConfig - io.druid.indexer.HadoopIOConfig

Example 1 with HadoopIOConfig

Use of io.druid.indexer.HadoopIOConfig in the druid project by druid-io.

From the class GranularityPathSpecTest, method testAddInputPath.

/**
 * Creates four files under the hourly folders H=00, H=02 and H=05 of 2015-11-06 and checks
 * that {@code granularityPathSpec.addInputPaths} records each of them, paired with
 * TextInputFormat, in the job's "mapreduce.input.multipleinputs.dir.formats" setting.
 */
@Test
public void testAddInputPath() throws Exception {
    // Install a testing user as the Hadoop login user before any job is created.
    final String[] testGroups = { "testGroup" };
    UserGroupInformation.setLoginUser(UserGroupInformation.createUserForTesting("test", testGroups));

    // Ingestion spec covering the single day 2015-11-06 (DAY segments, MINUTE query granularity).
    final UniformGranularitySpec granularitySpec = new UniformGranularitySpec(
        Granularities.DAY,
        Granularities.MINUTE,
        ImmutableList.of(new Interval("2015-11-06T00:00Z/2015-11-07T00:00Z"))
    );
    final DataSchema dataSchema = new DataSchema("foo", null, new AggregatorFactory[0], granularitySpec, jsonMapper);
    final HadoopIngestionSpec ingestionSpec = new HadoopIngestionSpec(
        dataSchema,
        new HadoopIOConfig(null, null, null),
        DEFAULT_TUNING_CONFIG
    );

    granularityPathSpec.setDataGranularity(Granularities.HOUR);
    granularityPathSpec.setFilePattern(".*");
    granularityPathSpec.setInputFormat(TextInputFormat.class);

    final Job hadoopJob = Job.getInstance();
    final String entryFormat = "file:%s/%s;org.apache.hadoop.mapreduce.lib.input.TextInputFormat";

    // Lay out the hourly folders first, then drop the input files into them.
    final String[] hourFolders = { "H=00", "H=02", "H=05" };
    for (String hourFolder : hourFolders) {
        testFolder.newFolder("test", "y=2015", "m=11", "d=06", hourFolder);
    }
    final String[] inputFiles = {
        "test/y=2015/m=11/d=06/H=00/file1",
        "test/y=2015/m=11/d=06/H=02/file2",
        "test/y=2015/m=11/d=06/H=05/file3",
        "test/y=2015/m=11/d=06/H=05/file4"
    };
    for (String inputFile : inputFiles) {
        testFolder.newFile(inputFile);
    }

    granularityPathSpec.setInputPath(testFolder.getRoot().getPath() + "/test");
    granularityPathSpec.addInputPaths(HadoopDruidIndexerConfig.fromSpec(ingestionSpec), hadoopJob);

    final String actual = hadoopJob.getConfiguration().get("mapreduce.input.multipleinputs.dir.formats");

    // One "path;inputFormat" entry per created file, comma separated, in creation order.
    final String[] expectedEntries = new String[inputFiles.length];
    for (int i = 0; i < inputFiles.length; i++) {
        expectedEntries[i] = String.format(entryFormat, testFolder.getRoot(), inputFiles[i]);
    }
    final String expected = Joiner.on(",").join(expectedEntries);

    Assert.assertEquals("Did not find expected input paths", expected, actual);
}

Also used : HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) DataSchema(io.druid.segment.indexing.DataSchema) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec) AggregatorFactory(io.druid.query.aggregation.AggregatorFactory) Job(org.apache.hadoop.mapreduce.Job) HadoopIOConfig(io.druid.indexer.HadoopIOConfig) Interval(org.joda.time.Interval) Test(org.junit.Test)

Example 2 with HadoopIOConfig

Use of io.druid.indexer.HadoopIOConfig in the druid project by druid-io.

From the class TaskSerdeTest, method testHadoopIndexTaskSerde.

/**
 * Serializes a {@code HadoopIndexTask} to JSON with {@code jsonMapper}, reads it back as a
 * {@code Task}, and checks that id, group id, data source, job properties and classpath
 * prefix match between the original and the deserialized task.
 */
@Test
public void testHadoopIndexTaskSerde() throws Exception {
    // Build the ingestion spec piece by piece; the tuning config is deliberately left null.
    final UniformGranularitySpec granularitySpec = new UniformGranularitySpec(
        Granularities.DAY,
        null,
        ImmutableList.of(new Interval("2010-01-01/P1D"))
    );
    final DataSchema dataSchema = new DataSchema("foo", null, new AggregatorFactory[0], granularitySpec, jsonMapper);
    final HadoopIOConfig ioConfig = new HadoopIOConfig(ImmutableMap.<String, Object>of("paths", "bar"), null, null);
    final HadoopIngestionSpec ingestionSpec = new HadoopIngestionSpec(dataSchema, ioConfig, null);

    final HadoopIndexTask original = new HadoopIndexTask(null, ingestionSpec, null, null, "blah", jsonMapper, null);

    // Round trip through JSON, deserializing via the Task base type.
    final String serialized = jsonMapper.writeValueAsString(original);
    final HadoopIndexTask restored = (HadoopIndexTask) jsonMapper.readValue(serialized, Task.class);

    Assert.assertEquals("foo", original.getDataSource());

    Assert.assertEquals(original.getId(), restored.getId());
    Assert.assertEquals(original.getGroupId(), restored.getGroupId());
    Assert.assertEquals(original.getDataSource(), restored.getDataSource());
    Assert.assertEquals(
        original.getSpec().getTuningConfig().getJobProperties(),
        restored.getSpec().getTuningConfig().getJobProperties()
    );

    Assert.assertEquals("blah", original.getClasspathPrefix());
    Assert.assertEquals("blah", restored.getClasspathPrefix());
}

Also used : HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) DataSchema(io.druid.segment.indexing.DataSchema) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec) HadoopIOConfig(io.druid.indexer.HadoopIOConfig) Interval(org.joda.time.Interval) Test(org.junit.Test)

Example 3 with HadoopIOConfig

Use of io.druid.indexer.HadoopIOConfig in the druid project by druid-io.

From the class OrcIndexGeneratorJobTest, method setUp.

/**
 * Prepares the fixture: registers the "hashed" shard spec subtype on the shared JSON mapper,
 * writes the test data to a local ORC file, and builds {@code config} from an ingestion spec
 * that reads that file through a static input spec and writes under {@code outputRoot}.
 */
@Before
public void setUp() throws Exception {
    mapper = HadoopDruidIndexerConfig.JSON_MAPPER;
    mapper.registerSubtypes(new NamedType(HashBasedNumberedShardSpec.class, "hashed"));

    dataRoot = temporaryFolder.newFolder("data");
    outputRoot = temporaryFolder.newFolder("output");
    final File orcFile = writeDataToLocalOrcFile(dataRoot, data);

    // Static input spec pointing at the single ORC file written above.
    final Map<String, Object> staticInputSpec = new HashMap<String, Object>();
    staticInputSpec.put("paths", orcFile.getCanonicalPath());
    staticInputSpec.put("type", "static");
    staticInputSpec.put("inputFormat", "org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat");

    final DataSchema dataSchema = new DataSchema(
        dataSourceName,
        mapper.convertValue(inputRowParser, Map.class),
        aggs,
        new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(this.interval)),
        mapper
    );
    final HadoopIOConfig ioConfig = new HadoopIOConfig(
        ImmutableMap.copyOf(staticInputSpec),
        null,
        outputRoot.getCanonicalPath()
    );
    final HadoopTuningConfig tuningConfig = new HadoopTuningConfig(
        outputRoot.getCanonicalPath(),
        null,
        null,
        null,
        null,
        null,
        false,
        false,
        false,
        false,
        // verifies that set num reducers is ignored
        ImmutableMap.of(JobContext.NUM_REDUCES, "0"),
        false,
        true,
        null,
        true,
        null,
        false,
        false
    );

    config = new HadoopDruidIndexerConfig(new HadoopIngestionSpec(dataSchema, ioConfig, tuningConfig));
    config.setShardSpecs(loadShardSpecs(shardInfoForEachSegment));
    // Rebuild the config from its own schema once the shard specs have been attached.
    config = HadoopDruidIndexerConfig.fromSpec(config.getSchema());
}

Also used : HashBasedNumberedShardSpec(io.druid.timeline.partition.HashBasedNumberedShardSpec) HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) HashMap(java.util.HashMap) NamedType(com.fasterxml.jackson.databind.jsontype.NamedType) HadoopTuningConfig(io.druid.indexer.HadoopTuningConfig) HadoopDruidIndexerConfig(io.druid.indexer.HadoopDruidIndexerConfig) HadoopIOConfig(io.druid.indexer.HadoopIOConfig) DataSchema(io.druid.segment.indexing.DataSchema) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec) OrcFile(org.apache.orc.OrcFile) File(java.io.File) Before(org.junit.Before)

Example 4 with HadoopIOConfig

Use of io.druid.indexer.HadoopIOConfig in the druid project by druid-io.

From the class GranularityPathSpecTest, method testIntervalTrimming.

/**
 * Creates hourly input files across 2015-01-01 and 2015-01-02 and checks that, for the
 * ingestion interval 2015-01-01T11Z/2015-01-02T05Z, {@code granularityPathSpec.addInputPaths}
 * records only the files in hours 01/18, 02/00 and 02/03; the files in hours 01/00, 01/10,
 * 02/05, 02/07 and 02/09 are expected to be left out.
 */
@Test
public void testIntervalTrimming() throws Exception {
    // Install a testing user as the Hadoop login user before any job is created.
    final String[] testGroups = { "testGroup" };
    UserGroupInformation.setLoginUser(UserGroupInformation.createUserForTesting("test", testGroups));

    final UniformGranularitySpec granularitySpec = new UniformGranularitySpec(
        Granularities.DAY,
        Granularities.ALL,
        ImmutableList.of(new Interval("2015-01-01T11Z/2015-01-02T05Z"))
    );
    final DataSchema dataSchema = new DataSchema("foo", null, new AggregatorFactory[0], granularitySpec, jsonMapper);
    final HadoopIngestionSpec ingestionSpec = new HadoopIngestionSpec(
        dataSchema,
        new HadoopIOConfig(null, null, null),
        DEFAULT_TUNING_CONFIG
    );

    granularityPathSpec.setDataGranularity(Granularities.HOUR);
    granularityPathSpec.setPathFormat("yyyy/MM/dd/HH");
    granularityPathSpec.setFilePattern(".*");
    granularityPathSpec.setInputFormat(TextInputFormat.class);

    final Job hadoopJob = Job.getInstance();
    final String entryFormat = "file:%s/%s;org.apache.hadoop.mapreduce.lib.input.TextInputFormat";

    // Files both inside and outside the ingestion interval.
    createFile(
        testFolder,
        "test/2015/01/01/00/file1",
        "test/2015/01/01/10/file2",
        "test/2015/01/01/18/file3",
        "test/2015/01/02/00/file1",
        "test/2015/01/02/03/file2",
        "test/2015/01/02/05/file3",
        "test/2015/01/02/07/file4",
        "test/2015/01/02/09/file5"
    );

    granularityPathSpec.setInputPath(testFolder.getRoot().getPath() + "/test");
    granularityPathSpec.addInputPaths(HadoopDruidIndexerConfig.fromSpec(ingestionSpec), hadoopJob);

    final String actual = hadoopJob.getConfiguration().get("mapreduce.input.multipleinputs.dir.formats");

    // Only these three files are expected to survive the trimming.
    final String[] expectedFiles = {
        "test/2015/01/01/18/file3",
        "test/2015/01/02/00/file1",
        "test/2015/01/02/03/file2"
    };
    final String[] expectedEntries = new String[expectedFiles.length];
    for (int i = 0; i < expectedFiles.length; i++) {
        expectedEntries[i] = String.format(entryFormat, testFolder.getRoot(), expectedFiles[i]);
    }
    final String expected = Joiner.on(",").join(expectedEntries);

    Assert.assertEquals("Did not find expected input paths", expected, actual);
}

Example 5 with HadoopIOConfig

Use of io.druid.indexer.HadoopIOConfig in the druid project by druid-io.

From the class StaticPathSpecTest, method testAddingPaths.

/**
 * Checks that a {@code StaticPathSpec} built from "/a/c,/a/b/{c,d}" registers three inputs on
 * the job ("/a/c", "/a/b/c" and "/a/b/d"), each paired with the TextInputFormat class name,
 * under the {@code MultipleInputs.DIR_FORMATS} configuration key.
 */
@Test
public void testAddingPaths() throws Exception {
    // Job's public constructors are deprecated; use the factory like the sibling path spec tests.
    Job job = Job.getInstance();
    // The second constructor argument is left null; the expectation below shows the entries
    // end up paired with TextInputFormat.
    StaticPathSpec pathSpec = new StaticPathSpec("/a/c,/a/b/{c,d}", null);
    DataSchema schema = new DataSchema("ds", null, new AggregatorFactory[0], null, jsonMapper);
    HadoopIOConfig io = new HadoopIOConfig(null, null, null);
    pathSpec.addInputPaths(new HadoopDruidIndexerConfig(new HadoopIngestionSpec(schema, io, null)), job);

    String paths = job.getConfiguration().get(MultipleInputs.DIR_FORMATS);
    // Fail with a readable message instead of a NullPointerException if nothing was registered.
    Assert.assertNotNull("No input paths were registered on the job", paths);

    String formatter = TextInputFormat.class.getName();
    String[] expected = { "/a/c;" + formatter, "/a/b/c;" + formatter, "/a/b/d;" + formatter };
    Assert.assertArrayEquals(expected, paths.split(","));
}

Also used : DataSchema(io.druid.segment.indexing.DataSchema) HadoopIngestionSpec(io.druid.indexer.HadoopIngestionSpec) Job(org.apache.hadoop.mapreduce.Job) HadoopDruidIndexerConfig(io.druid.indexer.HadoopDruidIndexerConfig) HadoopIOConfig(io.druid.indexer.HadoopIOConfig) Test(org.junit.Test)

Aggregations

HadoopIOConfig (io.druid.indexer.HadoopIOConfig)6 HadoopIngestionSpec (io.druid.indexer.HadoopIngestionSpec)6 DataSchema (io.druid.segment.indexing.DataSchema)6 UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec)5 Interval (org.joda.time.Interval)4 Test (org.junit.Test)4 HadoopDruidIndexerConfig (io.druid.indexer.HadoopDruidIndexerConfig)3 Job (org.apache.hadoop.mapreduce.Job)3 HadoopTuningConfig (io.druid.indexer.HadoopTuningConfig)2 AggregatorFactory (io.druid.query.aggregation.AggregatorFactory)2 File (java.io.File)2 Before (org.junit.Before)2 NamedType (com.fasterxml.jackson.databind.jsontype.NamedType)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 ByteSource (com.google.common.io.ByteSource)1 DelimitedParseSpec (io.druid.data.input.impl.DelimitedParseSpec)1 DimensionsSpec (io.druid.data.input.impl.DimensionsSpec)1 StringInputRowParser (io.druid.data.input.impl.StringInputRowParser)1 TimestampSpec (io.druid.data.input.impl.TimestampSpec)1 HadoopDruidDetermineConfigurationJob (io.druid.indexer.HadoopDruidDetermineConfigurationJob)1