Example 31 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From the class OrcHadoopInputRowParserTest, method testTest2.

@Test
public void testTest2() throws IOException {
    HadoopDruidIndexerConfig config = loadHadoopDruidIndexerConfig("example/test_2_hadoop_job.json");
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);
    /*
      test_2.orc
      struct<timestamp:string,col1:string,col2:array<string>,col3:float,col4:bigint,col5:decimal,col6:array<string>,col7:map<string,string>>
      {2016-01-01, bar, [dat1, dat2, dat3], 1.1, 2, 3.5, [], {subcol7=subval7}}
     */
    OrcStruct data = getFirstRow(job, ((StaticPathSpec) config.getPathSpec()).getPaths());
    List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
    Assert.assertEquals(7, rows.get(0).getDimensions().size());
    Assert.assertEquals("bar", rows.get(0).getDimension("col1").get(0));
    Assert.assertEquals("dat1", rows.get(0).getDimension("col2").get(0));
    Assert.assertEquals("dat2", rows.get(0).getDimension("col2").get(1));
    Assert.assertEquals("dat3", rows.get(0).getDimension("col2").get(2));
    Assert.assertEquals(1.1f, rows.get(0).getRaw("col3"));
    Assert.assertEquals(2L, rows.get(0).getRaw("col4"));
    Assert.assertEquals(3.5d, rows.get(0).getRaw("col5"));
    Assert.assertEquals(ImmutableList.of(), rows.get(0).getRaw("col6"));
    Assert.assertEquals("subval7", rows.get(0).getRaw("col7-subcol7"));
}
Also used: OrcStruct (org.apache.orc.mapred.OrcStruct), Configuration (org.apache.hadoop.conf.Configuration), InputRow (org.apache.druid.data.input.InputRow), ArrayList (java.util.ArrayList), List (java.util.List), ImmutableList (com.google.common.collect.ImmutableList), HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig), Job (org.apache.hadoop.mapreduce.Job), Test (org.junit.Test)
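
A note on the helpers: loadHadoopDruidIndexerConfig and getFirstRow are defined elsewhere in OrcHadoopInputRowParserTest and are not part of this snippet. A minimal sketch of the loader, assuming the spec JSON is reachable as a plain file path (java.io.File), could look like this:

private static HadoopDruidIndexerConfig loadHadoopDruidIndexerConfig(String configPath) {
    // Hypothetical helper: parse the ingestion spec JSON into a config
    // via the HadoopDruidIndexerConfig.fromFile factory.
    return HadoopDruidIndexerConfig.fromFile(new File(configPath));
}

Also worth noticing in the assertions above: the ORC map column is flattened into a dimension named col7-subcol7 rather than surfacing as a nested value.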

Example 32 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From the class HdfsDataSegmentPusherTest, method shouldMakeHDFSCompliantSegmentOutputPath.

@Test
public void shouldMakeHDFSCompliantSegmentOutputPath() {
    HadoopIngestionSpec schema;
    try {
        schema = objectMapper.readValue(
            "{\n"
            + "    \"dataSchema\": {\n"
            + "        \"dataSource\": \"source\",\n"
            + "        \"metricsSpec\": [],\n"
            + "        \"granularitySpec\": {\n"
            + "            \"type\": \"uniform\",\n"
            + "            \"segmentGranularity\": \"hour\",\n"
            + "            \"intervals\": [\"2012-07-10/P1D\"]\n"
            + "        }\n"
            + "    },\n"
            + "    \"ioConfig\": {\n"
            + "        \"type\": \"hadoop\",\n"
            + "        \"segmentOutputPath\": \"hdfs://server:9100/tmp/druid/datatest\"\n"
            + "    }\n"
            + "}",
            HadoopIngestionSpec.class
        );
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    HadoopDruidIndexerConfig cfg = new HadoopDruidIndexerConfig(
        schema.withTuningConfig(schema.getTuningConfig().withVersion("some:brand:new:version"))
    );
    Bucket bucket = new Bucket(4711, new DateTime(2012, 7, 10, 5, 30, ISOChronology.getInstanceUTC()), 4712);
    DataSegment segment = new DataSegment(
        cfg.getSchema().getDataSchema().getDataSource(),
        cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
        cfg.getSchema().getTuningConfig().getVersion(),
        null, null, null,
        new NumberedShardSpec(bucket.partitionNum, 5000), -1, 0
    );
    Path path = JobHelper.makeFileNamePath(
        new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
        new DistributedFileSystem(), segment, JobHelper.INDEX_ZIP, hdfsDataSegmentPusher
    );
    Assert.assertEquals(
        "hdfs://server:9100/tmp/druid/datatest/source/20120710T050000.000Z_20120710T060000.000Z"
        + "/some_brand_new_version/4712_index.zip",
        path.toString()
    );
    path = JobHelper.makeTmpPath(
        new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
        new DistributedFileSystem(), segment,
        new TaskAttemptID("abc", 123, TaskType.REDUCE, 1, 0), hdfsDataSegmentPusher
    );
    Assert.assertEquals(
        "hdfs://server:9100/tmp/druid/datatest/source/20120710T050000.000Z_20120710T060000.000Z"
        + "/some_brand_new_version/4712_index.zip.0",
        path.toString()
    );
}
Also used: HadoopIngestionSpec (org.apache.druid.indexer.HadoopIngestionSpec), Path (org.apache.hadoop.fs.Path), Bucket (org.apache.druid.indexer.Bucket), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig), DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem), DataSegment (org.apache.druid.timeline.DataSegment), ExpectedException (org.junit.rules.ExpectedException), IOException (java.io.IOException), DateTime (org.joda.time.DateTime), NumberedShardSpec (org.apache.druid.timeline.partition.NumberedShardSpec), Test (org.junit.Test)
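
The two assertions turn on one detail: ':' is not a legal character in an HDFS path segment, so the segment version "some:brand:new:version" has to show up in the output path as "some_brand_new_version". A minimal sketch of that substitution, purely illustrative rather than the pusher's actual code:

// ':' is rejected in HDFS path segments, so the version string is sanitized.
String version = "some:brand:new:version";
String hdfsSafeVersion = version.replace(':', '_'); // "some_brand_new_version"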

Example 33 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From the class DatasourcePathSpecTest, method testAddInputPaths.

@Test
public void testAddInputPaths() throws Exception {
    HadoopDruidIndexerConfig hadoopIndexerConfig = makeHadoopDruidIndexerConfig();
    DatasourcePathSpec pathSpec1 = new DatasourcePathSpec(segments1, ingestionSpec1, null, false);
    DatasourcePathSpec pathSpec2 = new DatasourcePathSpec(segments2, ingestionSpec2, null, false);
    Configuration config = new Configuration();
    Job job = EasyMock.createNiceMock(Job.class);
    EasyMock.expect(job.getConfiguration()).andReturn(config).anyTimes();
    EasyMock.replay(job);
    pathSpec1.addInputPaths(hadoopIndexerConfig, job);
    pathSpec2.addInputPaths(hadoopIndexerConfig, job);
    Assert.assertEquals(ImmutableList.of(ingestionSpec1.getDataSource(), ingestionSpec2.getDataSource()), DatasourceInputFormat.getDataSources(config));
    Assert.assertEquals(segments1, DatasourceInputFormat.getSegments(config, ingestionSpec1.getDataSource()));
    Assert.assertEquals(segments2, DatasourceInputFormat.getSegments(config, ingestionSpec2.getDataSource()));
    Assert.assertEquals(
        ingestionSpec1.withDimensions(ImmutableList.of("product")).withMetrics(ImmutableList.of("visited_sum")),
        DatasourceInputFormat.getIngestionSpec(config, ingestionSpec1.getDataSource())
    );
    Assert.assertEquals(
        ingestionSpec2.withDimensions(ImmutableList.of("product2")).withMetrics(ImmutableList.of("visited_sum")),
        DatasourceInputFormat.getIngestionSpec(config, ingestionSpec2.getDataSource())
    );
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig), Job (org.apache.hadoop.mapreduce.Job), Test (org.junit.Test)
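
Here too, makeHadoopDruidIndexerConfig is a test helper whose body is not shown. One plausible shape, assuming the test prepares a complete HadoopIngestionSpec first (the parameter below is an assumption; the real helper may take no arguments):

private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(HadoopIngestionSpec spec) {
    // Hypothetical sketch: wrap a fully built ingestion spec (data schema,
    // IO config, tuning config) using the fromSpec factory.
    return HadoopDruidIndexerConfig.fromSpec(spec);
}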

Example 34 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From the class DatasourcePathSpecTest, method testAddInputPathsWithNoSegments.

@Test
public void testAddInputPathsWithNoSegments() throws Exception {
    HadoopDruidIndexerConfig hadoopIndexerConfig = makeHadoopDruidIndexerConfig();
    DatasourcePathSpec pathSpec = new DatasourcePathSpec(null, ingestionSpec1, null, false);
    Configuration config = new Configuration();
    Job job = EasyMock.createNiceMock(Job.class);
    EasyMock.expect(job.getConfiguration()).andReturn(config).anyTimes();
    EasyMock.replay(job);
    try {
        pathSpec.addInputPaths(hadoopIndexerConfig, job);
        Assert.fail("expected an ISE because the path spec has no segments");
    } catch (ISE ex) {
        // expected: ignoreWhenNoSegments is not set, so empty segments are an error
    }
    // now with ignoreWhenNoSegments flag set
    pathSpec = new DatasourcePathSpec(null, ingestionSpec1.withIgnoreWhenNoSegments(true), null, false);
    pathSpec.addInputPaths(hadoopIndexerConfig, job);
    Assert.assertEquals(Collections.emptyList(), DatasourceInputFormat.getDataSources(config));
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), ISE (org.apache.druid.java.util.common.ISE), HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig), Job (org.apache.hadoop.mapreduce.Job), Test (org.junit.Test)
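
The try/fail/catch idiom in this test predates JUnit's built-in exception assertion. Assuming JUnit 4.13 or later is on the test classpath (not something this snippet confirms), the same check collapses to one line:

// Equivalent check with JUnit 4.13+ (assumed available):
Assert.assertThrows(ISE.class, () -> pathSpec.addInputPaths(hadoopIndexerConfig, job));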

Aggregations

HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig): 34 usages
Test (org.junit.Test): 32 usages
InputRow (org.apache.druid.data.input.InputRow): 27 usages
ImmutableList (com.google.common.collect.ImmutableList): 19 usages
List (java.util.List): 19 usages
Job (org.apache.hadoop.mapreduce.Job): 9 usages
ArrayList (java.util.ArrayList): 8 usages
Configuration (org.apache.hadoop.conf.Configuration): 8 usages
OrcStruct (org.apache.orc.mapred.OrcStruct): 6 usages
BigDecimal (java.math.BigDecimal): 3 usages
HadoopIngestionSpec (org.apache.druid.indexer.HadoopIngestionSpec): 3 usages
IOException (java.io.IOException): 2 usages
HashSet (java.util.HashSet): 2 usages
DimensionSchema (org.apache.druid.data.input.impl.DimensionSchema): 2 usages
ParseSpec (org.apache.druid.data.input.impl.ParseSpec): 2 usages
Bucket (org.apache.druid.indexer.Bucket): 2 usages
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 2 usages
DataSegment (org.apache.druid.timeline.DataSegment): 2 usages
NumberedShardSpec (org.apache.druid.timeline.partition.NumberedShardSpec): 2 usages
Path (org.apache.hadoop.fs.Path): 2 usages