Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
In the class WikiParquetInputTest, method testWiki:
@Test
public void testWiki() throws IOException, InterruptedException {
HadoopDruidIndexerConfig config = transformHadoopDruidIndexerConfig("example/wiki/wiki.json", parserType, false);
config.intoConfiguration(job);
Object data = getFirstRow(job, parserType, ((StaticPathSpec) config.getPathSpec()).getPaths());
List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
Assert.assertEquals("Gypsy Danger", rows.get(0).getDimension("page").get(0));
String s1 = rows.get(0).getDimension("language").get(0);
String s2 = rows.get(0).getDimension("language").get(1);
Assert.assertEquals("en", s1);
Assert.assertEquals("zh", s2);
}
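The helpers used here (transformHadoopDruidIndexerConfig, getFirstRow) and the parserType and job fields come from the surrounding test hierarchy. A minimal sketch of the underlying flow, assuming the spec JSON is loaded with HadoopDruidIndexerConfig.fromFile (an assumption about the available factory method) and that a raw record has already been read from the input format:

// Sketch only: the class name, method name, and fromFile usage are assumptions,
// not the test's own helpers.
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.indexer.HadoopDruidIndexerConfig;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class WikiSpecSketch {
    static List<InputRow> parseOneRecord(File specFile, Object rawRecord) throws IOException {
        // Load the ingestion spec (dataSchema + ioConfig + tuningConfig) from JSON.
        HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromFile(specFile);
        // Copy the serialized spec into the Hadoop job configuration.
        Job job = Job.getInstance(new Configuration());
        config.intoConfiguration(job);
        // The configured parser turns the raw record (a Parquet record in the test above)
        // into one or more InputRows.
        return (List<InputRow>) config.getParser().parseBatch(rawRecord);
    }
}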
Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
In the class StaticPathSpecTest, method testAddingPaths:
@Test
public void testAddingPaths() throws Exception {
Job job = new Job();
StaticPathSpec pathSpec = new StaticPathSpec("/a/c,/a/b/{c,d}", null);
DataSchema schema = new DataSchema("ds", null, new AggregatorFactory[0], null, null, jsonMapper);
HadoopIOConfig io = new HadoopIOConfig(null, null, null);
pathSpec.addInputPaths(new HadoopDruidIndexerConfig(new HadoopIngestionSpec(schema, io, null)), job);
String paths = job.getConfiguration().get(MultipleInputs.DIR_FORMATS);
String formatter = TextInputFormat.class.getName();
String[] expected = { "/a/c;" + formatter, "/a/b/c;" + formatter, "/a/b/d;" + formatter };
Assert.assertArrayEquals(expected, paths.split(","));
}
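The null second argument to StaticPathSpec is what makes TextInputFormat appear in the assertion; passing an explicit Hadoop input format class is the other way the constructor can be used (inferred from the test's fallback behavior, so treat it as an assumption). A short sketch reusing the schema, io, and job objects from the test above, with an illustrative path and format class:

// Illustrative: pin a specific input format instead of the TextInputFormat fallback
// exercised above (the path and format class are examples, not from the test).
StaticPathSpec sequencePaths = new StaticPathSpec(
    "/data/events/2012-07-10/part-*",
    org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class
);
sequencePaths.addInputPaths(new HadoopDruidIndexerConfig(new HadoopIngestionSpec(schema, io, null)), job);
// The job configuration then pairs each comma-separated path with the chosen
// format, producing entries of the form "path;inputFormatClassName".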
Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
In the class HdfsDataSegmentPusherTest, method shouldMakeDefaultSegmentOutputPathIfNotHDFS:
@Test
public void shouldMakeDefaultSegmentOutputPathIfNotHDFS() {
final HadoopIngestionSpec schema;
try {
schema = objectMapper.readValue(
    "{\n"
        + " \"dataSchema\": {\n"
        + " \"dataSource\": \"the:data:source\",\n"
        + " \"metricsSpec\": [],\n"
        + " \"granularitySpec\": {\n"
        + " \"type\": \"uniform\",\n"
        + " \"segmentGranularity\": \"hour\",\n"
        + " \"intervals\": [\"2012-07-10/P1D\"]\n"
        + " }\n"
        + " },\n"
        + " \"ioConfig\": {\n"
        + " \"type\": \"hadoop\",\n"
        + " \"segmentOutputPath\": \"/tmp/dru:id/data:test\"\n"
        + " }\n"
        + "}",
    HadoopIngestionSpec.class
);
} catch (Exception e) {
throw new RuntimeException(e);
}
HadoopDruidIndexerConfig cfg = new HadoopDruidIndexerConfig(schema.withTuningConfig(schema.getTuningConfig().withVersion("some:brand:new:version")));
Bucket bucket = new Bucket(4711, new DateTime(2012, 07, 10, 5, 30, ISOChronology.getInstanceUTC()), 4712);
Path path = JobHelper.makeFileNamePath(
    new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
    new LocalFileSystem(),
    new DataSegment(
        cfg.getSchema().getDataSchema().getDataSource(),
        cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
        cfg.getSchema().getTuningConfig().getVersion(),
        null,
        null,
        null,
        new NumberedShardSpec(bucket.partitionNum, 5000),
        -1,
        0
    ),
    JobHelper.INDEX_ZIP,
    new LocalDataSegmentPusher(new LocalDataSegmentPusherConfig())
);
Assert.assertEquals(
    "file:/tmp/dru:id/data:test/the:data:source/2012-07-10T05:00:00.000Z_2012-07-10T06:00:00.000Z/some:brand:new:"
        + "version/4712/index.zip",
    path.toString()
);
path = JobHelper.makeTmpPath(
    new Path(cfg.getSchema().getIOConfig().getSegmentOutputPath()),
    new LocalFileSystem(),
    new DataSegment(
        cfg.getSchema().getDataSchema().getDataSource(),
        cfg.getSchema().getDataSchema().getGranularitySpec().bucketInterval(bucket.time).get(),
        cfg.getSchema().getTuningConfig().getVersion(),
        null,
        null,
        null,
        new NumberedShardSpec(bucket.partitionNum, 5000),
        -1,
        0
    ),
    new TaskAttemptID("abc", 123, TaskType.REDUCE, 1, 0),
    new LocalDataSegmentPusher(new LocalDataSegmentPusherConfig())
);
Assert.assertEquals(
    "file:/tmp/dru:id/data:test/the:data:source/2012-07-10T05:00:00.000Z_2012-07-10T06:00:00.000Z/some:brand:new:"
        + "version/4712/index.zip.0",
    path.toString()
);
}
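The two assertions pin down the default (non-HDFS) layout: segmentOutputPath/dataSource/intervalStart_intervalEnd/version/partitionNum/index.zip, plus a task-attempt suffix for the temporary path. A hypothetical helper that reproduces only that string layout (it is not Druid's JobHelper itself):

// Hypothetical path builder mirroring the layout asserted above; argument names
// follow the DataSegment fields used in the test.
static String defaultSegmentPath(
        String segmentOutputPath,
        String dataSource,
        String intervalStart,
        String intervalEnd,
        String version,
        int partitionNum) {
    return segmentOutputPath + "/" + dataSource
            + "/" + intervalStart + "_" + intervalEnd
            + "/" + version
            + "/" + partitionNum
            + "/index.zip";
}

For the test's inputs this yields .../the:data:source/2012-07-10T05:00:00.000Z_2012-07-10T06:00:00.000Z/some:brand:new:version/4712/index.zip, matching the first assertion once the file: scheme prefix is added.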
Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
In the class OrcHadoopInputRowParserTest, method testOrcSplitElim:
@Test
public void testOrcSplitElim() throws IOException {
// not sure what SplitElim means, but we'll test it!
/*
orc_split_elim.orc
struct<userid:bigint,string1:string,subtype:double,decimal1:decimal(38,10),ts:timestamp>
{2, foo, 0.8, 1.2, 1969-12-31 16:00:00.0}
*/
HadoopDruidIndexerConfig config = loadHadoopDruidIndexerConfig("example/orc_split_elim_hadoop_job.json");
Job job = Job.getInstance(new Configuration());
config.intoConfiguration(job);
OrcStruct data = getFirstRow(job, ((StaticPathSpec) config.getPathSpec()).getPaths());
List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
Assert.assertEquals(4, rows.get(0).getDimensions().size());
Assert.assertEquals("2", rows.get(0).getDimension("userid").get(0));
Assert.assertEquals("foo", rows.get(0).getDimension("string1").get(0));
Assert.assertEquals("0.8", rows.get(0).getDimension("subtype").get(0));
Assert.assertEquals("1.2", rows.get(0).getDimension("decimal1").get(0));
Assert.assertEquals(DateTimes.of("1969-12-31T16:00:00.0Z"), rows.get(0).getTimestamp());
}
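As a hedged follow-up to the assertions above: getDimension always yields the string form of a value, while Row.getRaw returns the value as it sits in the parsed row, so the numeric ORC columns can be inspected before coercion (the exact runtime types are an assumption that depends on the ORC reader):

// Pre-coercion values for the same row; the concrete types are assumptions.
Object rawUserId = rows.get(0).getRaw("userid");   // expected numeric, not the string "2"
Object rawSubtype = rows.get(0).getRaw("subtype"); // expected numeric, not the string "0.8"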
Use of org.apache.druid.indexer.HadoopDruidIndexerConfig in project druid by druid-io.
In the class OrcHadoopInputRowParserTest, method testTest1:
@Test
public void testTest1() throws IOException {
// total auto-discover fields (no flattenSpec, no dimensionSpec)
HadoopDruidIndexerConfig config = loadHadoopDruidIndexerConfig("example/test_1_hadoop_job.json");
Job job = Job.getInstance(new Configuration());
config.intoConfiguration(job);
/*
test_1.orc
struct<timestamp:string,col1:string,col2:array<string>,val1:float>
{2016-01-01T00:00:00.000Z, bar, [dat1, dat2, dat3], 1.1}
*/
OrcStruct data = getFirstRow(job, ((StaticPathSpec) config.getPathSpec()).getPaths());
List<InputRow> rows = (List<InputRow>) config.getParser().parseBatch(data);
Assert.assertEquals(3, rows.get(0).getDimensions().size());
Assert.assertEquals("bar", rows.get(0).getDimension("col1").get(0));
String s1 = rows.get(0).getDimension("col2").get(0);
String s2 = rows.get(0).getDimension("col2").get(1);
String s3 = rows.get(0).getDimension("col2").get(2);
Assert.assertEquals("dat1", s1);
Assert.assertEquals("dat2", s2);
Assert.assertEquals("dat3", s3);
}
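As a short complement to the element-by-element checks above, the multi-value col2 dimension can also be compared as a whole, since getDimension returns a List<String> (Arrays here is java.util.Arrays):

// Multi-value ORC list columns surface as a List<String> dimension.
List<String> col2Values = rows.get(0).getDimension("col2");
Assert.assertEquals(Arrays.asList("dat1", "dat2", "dat3"), col2Values);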