
Example 16 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From the class WikiParquetInputTest, method testWiki().

@Test
public void testWiki() throws IOException, InterruptedException {
    HadoopDruidIndexerConfig config = transformHadoopDruidIndexerConfig("example/wiki/wiki.json", parserType, false);
    config.intoConfiguration(job);
    Object data = getFirstRow(job, parserType, ((StaticPathSpec) config.getPathSpec()).getPaths());
    List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
    Assert.assertEquals("Gypsy Danger", rows.get(0).getDimension("page").get(0));
    String s1 = rows.get(0).getDimension("language").get(0);
    String s2 = rows.get(0).getDimension("language").get(1);
    Assert.assertEquals("en", s1);
    Assert.assertEquals("zh", s2);
}
Also used : InputRow(org.apache.druid.data.input.InputRow) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) Test(org.junit.Test)
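
The pattern above is the usual round trip for this class: build a HadoopDruidIndexerConfig on the driver side, then call intoConfiguration(job) so the spec travels inside the Hadoop job configuration to the tasks. Below is a minimal standalone sketch of that round trip; it assumes the static factories HadoopDruidIndexerConfig.fromFile and HadoopDruidIndexerConfig.fromConfiguration are available in the Druid version at hand, and the spec path is only illustrative.

import java.io.File;

import org.apache.druid.indexer.HadoopDruidIndexerConfig;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ConfigRoundTripSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical path to a Hadoop ingestion spec JSON; fromFile is assumed to be
        // the static factory that deserializes it into a HadoopDruidIndexerConfig.
        HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromFile(new File("example/wiki/wiki.json"));

        // Serialize the config into the job configuration, exactly as the test above does.
        Job job = Job.getInstance(new Configuration());
        config.intoConfiguration(job);

        // On the task side the config is rebuilt from that same Configuration
        // (fromConfiguration is assumed to be the matching factory).
        HadoopDruidIndexerConfig rebuilt = HadoopDruidIndexerConfig.fromConfiguration(job.getConfiguration());
        System.out.println(rebuilt.getSchema().getDataSchema().getDataSource());
    }
}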

Example 17 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From the class StaticPathSpecTest, method testAddingPaths().

@Test
public void testAddingPaths() throws Exception {
    Job job = new Job();
    StaticPathSpec pathSpec = new StaticPathSpec("/a/c,/a/b/{c,d}", null);
    DataSchema schema = new DataSchema("ds", null, new AggregatorFactory[0], null, null, jsonMapper);
    HadoopIOConfig io = new HadoopIOConfig(null, null, null);
    pathSpec.addInputPaths(new HadoopDruidIndexerConfig(new HadoopIngestionSpec(schema, io, null)), job);
    String paths = job.getConfiguration().get(MultipleInputs.DIR_FORMATS);
    String formatter = TextInputFormat.class.getName();
    String[] expected = { "/a/c;" + formatter, "/a/b/c;" + formatter, "/a/b/d;" + formatter };
    Assert.assertArrayEquals(expected, paths.split(","));
}
Also used : DataSchema(org.apache.druid.segment.indexing.DataSchema) HadoopIngestionSpec(org.apache.druid.indexer.HadoopIngestionSpec) Job(org.apache.hadoop.mapreduce.Job) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) HadoopIOConfig(org.apache.druid.indexer.HadoopIOConfig) Test(org.junit.Test)
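
In this test the second constructor argument is null, which is why every registered path falls back to TextInputFormat in the expected array. The sketch below shows the same wiring with an explicit input format; treat it as a hedged illustration, since it assumes StaticPathSpec lives in org.apache.druid.indexer.path and that its second constructor parameter is the Hadoop InputFormat class, consistent with how the test reads back MultipleInputs.DIR_FORMATS.

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.druid.indexer.HadoopDruidIndexerConfig;
import org.apache.druid.indexer.HadoopIOConfig;
import org.apache.druid.indexer.HadoopIngestionSpec;
// Assumed package for StaticPathSpec in this Druid version.
import org.apache.druid.indexer.path.StaticPathSpec;
import org.apache.druid.query.aggregation.AggregatorFactory;
import org.apache.druid.segment.indexing.DataSchema;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;

public class ExplicitInputFormatSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        // Passing an explicit InputFormat instead of null should register that class
        // with MultipleInputs rather than the TextInputFormat default seen above.
        StaticPathSpec pathSpec = new StaticPathSpec("/data/events/*.seq", SequenceFileInputFormat.class);

        DataSchema schema = new DataSchema("ds", null, new AggregatorFactory[0], null, null, new ObjectMapper());
        HadoopIOConfig io = new HadoopIOConfig(null, null, null);
        pathSpec.addInputPaths(new HadoopDruidIndexerConfig(new HadoopIngestionSpec(schema, io, null)), job);

        // Each entry is "<path>;<input format class name>", comma separated.
        System.out.println(job.getConfiguration().get(MultipleInputs.DIR_FORMATS));
    }
}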

Example 18 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From the class HdfsDataSegmentPusherTest, method shouldMakeDefaultSegmentOutputPathIfNotHDFS().

@Test
public void shouldMakeDefaultSegmentOutputPathIfNotHDFS() {
    final HadoopIngestionSpec schema;
    try {
        schema = objectMapper.readValue(
            "{\n"
            + "    \"dataSchema\": {\n"
            + "        \"dataSource\": \"the:data:source\",\n"
            + "        \"metricsSpec\": [],\n"
            + "        \"granularitySpec\": {\n"
            + "            \"type\": \"uniform\",\n"
            + "            \"segmentGranularity\": \"hour\",\n"
            + "            \"intervals\": [\"2012-07-10/P1D\"]\n"
            + "        }\n"
            + "    },\n"
            + "    \"ioConfig\": {\n"
            + "        \"type\": \"hadoop\",\n"
            + "        \"segmentOutputPath\": \"/tmp/dru:id/data:test\"\n"
            + "    }\n"
            + "}",
            HadoopIngestionSpec.class
        );
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    HadoopDruidIndexerConfig cfg = new HadoopDruidIndexerConfig(
        schema.withTuningConfig(schema.getTuningConfig().withVersion("some:brand:new:version"))
    );
    Bucket bucket = new Bucket(4711, new DateTime(2012, 07, 10, 5, 30, ISOChronology.getInstanceUTC()), 4712);
    Path path = JobHelper.makeFileNamePath(
        new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
        new LocalFileSystem(),
        new DataSegment(
            cfg.getSchema().getDataSchema().getDataSource(),
            cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
            cfg.getSchema().getTuningConfig().getVersion(),
            null, null, null,
            new NumberedShardSpec(bucket.partitionNum, 5000),
            -1, 0
        ),
        JobHelper.INDEX_ZIP,
        new LocalDataSegmentPusher(new LocalDataSegmentPusherConfig())
    );
    Assert.assertEquals(
        "file:/tmp/dru:id/data:test/the:data:source/2012-07-10T05:00:00.000Z_2012-07-10T06:00:00.000Z/some:brand:new:"
        + "version/4712/index.zip",
        path.toString()
    );
    path = JobHelper.makeTmpPath(
        new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
        new LocalFileSystem(),
        new DataSegment(
            cfg.getSchema().getDataSchema().getDataSource(),
            cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
            cfg.getSchema().getTuningConfig().getVersion(),
            null, null, null,
            new NumberedShardSpec(bucket.partitionNum, 5000),
            -1, 0
        ),
        new TaskAttemptID("abc", 123, TaskType.REDUCE, 1, 0),
        new LocalDataSegmentPusher(new LocalDataSegmentPusherConfig())
    );
    Assert.assertEquals(
        "file:/tmp/dru:id/data:test/the:data:source/2012-07-10T05:00:00.000Z_2012-07-10T06:00:00.000Z/some:brand:new:"
        + "version/4712/index.zip.0",
        path.toString()
    );
}
Also used : HadoopIngestionSpec(org.apache.druid.indexer.HadoopIngestionSpec) Path(org.apache.hadoop.fs.Path) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) LocalDataSegmentPusherConfig(org.apache.druid.segment.loading.LocalDataSegmentPusherConfig) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) DataSegment(org.apache.druid.timeline.DataSegment) LocalDataSegmentPusher(org.apache.druid.segment.loading.LocalDataSegmentPusher) ExpectedException(org.junit.rules.ExpectedException) IOException(java.io.IOException) DateTime(org.joda.time.DateTime) Bucket(org.apache.druid.indexer.Bucket) LocalFileSystem(org.apache.hadoop.fs.LocalFileSystem) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) Test(org.junit.Test)
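
The two assertions encode the default (non-HDFS) segment path layout, in which characters such as ':' are left unescaped: {segmentOutputPath}/{dataSource}/{intervalStart}_{intervalEnd}/{version}/{partitionNum}/index.zip, with an extra numeric suffix (here .0) on the temporary per-attempt path. The sketch below is plain string assembly that mirrors those expected values; it uses no Druid APIs, and the helper name is made up for illustration.

public class DefaultSegmentPathSketch {
    // Mirrors the layout asserted above; the "file:" scheme prefix in the expected
    // strings comes from the Hadoop Path/LocalFileSystem, not from this layout.
    static String segmentPath(String outputPath, String dataSource, String intervalStart,
                              String intervalEnd, String version, int partitionNum) {
        return String.join("/",
            outputPath,
            dataSource,
            intervalStart + "_" + intervalEnd,
            version,
            String.valueOf(partitionNum),
            "index.zip");
    }

    public static void main(String[] args) {
        // Prints the path asserted in the test (minus the "file:" prefix).
        System.out.println(segmentPath(
            "/tmp/dru:id/data:test",
            "the:data:source",
            "2012-07-10T05:00:00.000Z",
            "2012-07-10T06:00:00.000Z",
            "some:brand:new:version",
            4712));
    }
}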

Example 19 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From the class OrcHadoopInputRowParserTest, method testOrcSplitElim().

@Test
public void testOrcSplitElim() throws IOException {
    // not sure what SplitElim means, but we'll test it!
    /*
      orc_split_elim.orc
      struct<userid:bigint,string1:string,subtype:double,decimal1:decimal(38,10),ts:timestamp>
      {2, foo, 0.8, 1.2, 1969-12-31 16:00:00.0}
     */
    HadoopDruidIndexerConfig config = loadHadoopDruidIndexerConfig("example/orc_split_elim_hadoop_job.json");
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);
    OrcStruct data = getFirstRow(job, ((StaticPathSpec) config.getPathSpec()).getPaths());
    List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
    Assert.assertEquals(4, rows.get(0).getDimensions().size());
    Assert.assertEquals("2", rows.get(0).getDimension("userid").get(0));
    Assert.assertEquals("foo", rows.get(0).getDimension("string1").get(0));
    Assert.assertEquals("0.8", rows.get(0).getDimension("subtype").get(0));
    Assert.assertEquals("1.2", rows.get(0).getDimension("decimal1").get(0));
    Assert.assertEquals(DateTimes.of("1969-12-31T16:00:00.0Z"), rows.get(0).getTimestamp());
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) Configuration(org.apache.hadoop.conf.Configuration) InputRow(org.apache.druid.data.input.InputRow) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) Job(org.apache.hadoop.mapreduce.Job) Test(org.junit.Test)

Example 20 with HadoopDruidIndexerConfig

Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.

From the class OrcHadoopInputRowParserTest, method testTest1().

@Test
public void testTest1() throws IOException {
    // total auto-discover fields (no flattenSpec, no dimensionSpec)
    HadoopDruidIndexerConfig config = loadHadoopDruidIndexerConfig("example/test_1_hadoop_job.json");
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);
    /*
      test_1.orc
      struct<timestamp:string,col1:string,col2:array<string>,val1:float>
      {2016-01-01T00:00:00.000Z, bar, [dat1, dat2, dat3], 1.1}
     */
    OrcStruct data = getFirstRow(job, ((StaticPathSpec) config.getPathSpec()).getPaths());
    List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
    Assert.assertEquals(3, rows.get(0).getDimensions().size());
    Assert.assertEquals("bar", rows.get(0).getDimension("col1").get(0));
    String s1 = rows.get(0).getDimension("col2").get(0);
    String s2 = rows.get(0).getDimension("col2").get(1);
    String s3 = rows.get(0).getDimension("col2").get(2);
    Assert.assertEquals("dat1", s1);
    Assert.assertEquals("dat2", s2);
    Assert.assertEquals("dat3", s3);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) Configuration(org.apache.hadoop.conf.Configuration) InputRow(org.apache.druid.data.input.InputRow) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) Job(org.apache.hadoop.mapreduce.Job) Test(org.junit.Test)
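
Both ORC tests probe individual dimensions by name; when exploring a new file it is often easier to dump everything a parsed row contains. The helper below is a hypothetical convenience, not part of the Druid test code, and it only uses the InputRow accessors already exercised above (getTimestamp, getDimensions, getDimension). Calling dump(rows.get(0)) in testTest1 would print col1 and the three col2 values asserted above.

import java.util.List;

import org.apache.druid.data.input.InputRow;

public class InputRowDump {
    // Prints the timestamp and every dimension of a parsed row; useful for deciding
    // what a dimensionsSpec or flattenSpec should keep.
    static void dump(InputRow row) {
        System.out.println("timestamp: " + row.getTimestamp());
        for (String dim : row.getDimensions()) {
            List<String> values = row.getDimension(dim);
            System.out.println(dim + " -> " + values);
        }
    }
}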

Aggregations

HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig): 34
Test (org.junit.Test): 32
InputRow (org.apache.druid.data.input.InputRow): 27
ImmutableList (com.google.common.collect.ImmutableList): 19
List (java.util.List): 19
Job (org.apache.hadoop.mapreduce.Job): 9
ArrayList (java.util.ArrayList): 8
Configuration (org.apache.hadoop.conf.Configuration): 8
OrcStruct (org.apache.orc.mapred.OrcStruct): 6
BigDecimal (java.math.BigDecimal): 3
HadoopIngestionSpec (org.apache.druid.indexer.HadoopIngestionSpec): 3
IOException (java.io.IOException): 2
HashSet (java.util.HashSet): 2
DimensionSchema (org.apache.druid.data.input.impl.DimensionSchema): 2
ParseSpec (org.apache.druid.data.input.impl.ParseSpec): 2
Bucket (org.apache.druid.indexer.Bucket): 2
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 2
DataSegment (org.apache.druid.timeline.DataSegment): 2
NumberedShardSpec (org.apache.druid.timeline.partition.NumberedShardSpec): 2
Path (org.apache.hadoop.fs.Path): 2