
Example 11 with UniformGranularitySpec

use of io.druid.segment.indexing.granularity.UniformGranularitySpec in project druid by druid-io.

the class IndexGeneratorCombinerTest method setUp.

@Before
public void setUp() throws Exception {
    HadoopDruidIndexerConfig config = new HadoopDruidIndexerConfig(
        new HadoopIngestionSpec(
            new DataSchema(
                "website",
                HadoopDruidIndexerConfig.JSON_MAPPER.convertValue(
                    new StringInputRowParser(
                        new TimeAndDimsParseSpec(
                            new TimestampSpec("timestamp", "yyyyMMddHH", null),
                            new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host", "keywords")), null, null)
                        ),
                        null
                    ),
                    Map.class
                ),
                new AggregatorFactory[] {
                    new LongSumAggregatorFactory("visited_sum", "visited"),
                    new HyperUniquesAggregatorFactory("unique_hosts", "host")
                },
                new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(Interval.parse("2010/2011"))),
                HadoopDruidIndexerConfig.JSON_MAPPER
            ),
            new HadoopIOConfig(ImmutableMap.<String, Object>of("paths", "/tmp/dummy", "type", "static"), null, "/tmp/dummy"),
            HadoopTuningConfig.makeDefaultTuningConfig().withWorkingPath("/tmp/work").withVersion("ver")
        )
    );
    Configuration hadoopConfig = new Configuration();
    hadoopConfig.set(HadoopDruidIndexerConfig.CONFIG_PROPERTY, HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(config));
    Reducer.Context context = EasyMock.createMock(Reducer.Context.class);
    EasyMock.expect(context.getConfiguration()).andReturn(hadoopConfig);
    EasyMock.replay(context);
    aggregators = config.getSchema().getDataSchema().getAggregators();
    combiner = new IndexGeneratorJob.IndexGeneratorCombiner();
    combiner.setup(context);
}
Also used : Configuration (org.apache.hadoop.conf.Configuration), LongSumAggregatorFactory (io.druid.query.aggregation.LongSumAggregatorFactory), DataSchema (io.druid.segment.indexing.DataSchema), TimeAndDimsParseSpec (io.druid.data.input.impl.TimeAndDimsParseSpec), UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec), StringInputRowParser (io.druid.data.input.impl.StringInputRowParser), TimestampSpec (io.druid.data.input.impl.TimestampSpec), HyperUniquesAggregatorFactory (io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory), DimensionsSpec (io.druid.data.input.impl.DimensionsSpec), Reducer (org.apache.hadoop.mapreduce.Reducer), Map (java.util.Map), ImmutableMap (com.google.common.collect.ImmutableMap), Before (org.junit.Before)
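
The granularity spec buried in that constructor is the part these examples are indexed on. A minimal standalone sketch of the three-argument form used above (segment granularity, query granularity, input intervals); the values are copied from the example and otherwise illustrative:

import com.google.common.collect.ImmutableList;
import io.druid.java.util.common.granularity.Granularities;
import io.druid.segment.indexing.granularity.UniformGranularitySpec;
import org.joda.time.Interval;

// DAY segment granularity; NONE query granularity (timestamps are not
// truncated at query time); one year-long input interval.
UniformGranularitySpec granularitySpec = new UniformGranularitySpec(
    Granularities.DAY,
    Granularities.NONE,
    ImmutableList.of(Interval.parse("2010/2011"))
);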

Example 12 with UniformGranularitySpec

use of io.druid.segment.indexing.granularity.UniformGranularitySpec in project druid by druid-io.

the class GranularUnprocessedPathSpec method addInputPaths.

@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
    // This PathSpec breaks so many abstractions that we might as well break some more
    Preconditions.checkState(
        config.getGranularitySpec() instanceof UniformGranularitySpec,
        String.format(
            "Cannot use %s without %s",
            GranularUnprocessedPathSpec.class.getSimpleName(),
            UniformGranularitySpec.class.getSimpleName()
        )
    );
    final Path betaInput = new Path(getInputPath());
    final FileSystem fs = betaInput.getFileSystem(job.getConfiguration());
    final Granularity segmentGranularity = config.getGranularitySpec().getSegmentGranularity();
    Map<Long, Long> inputModifiedTimes = new TreeMap<>(Comparators.inverse(Comparators.comparable()));
    for (FileStatus status : FSSpideringIterator.spiderIterable(fs, betaInput)) {
        final DateTime key = segmentGranularity.toDate(status.getPath().toString());
        // the map is keyed by bucket-start millis, so look up by millis (a DateTime key would never match)
        final Long currVal = inputModifiedTimes.get(key.getMillis());
        final long mTime = status.getModificationTime();
        inputModifiedTimes.put(key.getMillis(), currVal == null ? mTime : Math.max(currVal, mTime));
    }
    Set<Interval> bucketsToRun = Sets.newTreeSet(Comparators.intervals());
    for (Map.Entry<Long, Long> entry : inputModifiedTimes.entrySet()) {
        DateTime timeBucket = new DateTime(entry.getKey());
        long mTime = entry.getValue();
        String bucketOutput = String.format("%s/%s", config.getSchema().getIOConfig().getSegmentOutputPath(), segmentGranularity.toPath(timeBucket));
        for (FileStatus fileStatus : FSSpideringIterator.spiderIterable(fs, new Path(bucketOutput))) {
            if (fileStatus.getModificationTime() > mTime) {
                bucketsToRun.add(new Interval(timeBucket, segmentGranularity.increment(timeBucket)));
                break;
            }
        }
        if (bucketsToRun.size() >= maxBuckets) {
            break;
        }
    }
    config.setGranularitySpec(
        new UniformGranularitySpec(
            segmentGranularity,
            config.getGranularitySpec().getQueryGranularity(),
            config.getGranularitySpec().isRollup(),
            Lists.newArrayList(bucketsToRun)
        )
    );
    return super.addInputPaths(config, job);
}
Also used : Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), Granularity (io.druid.java.util.common.granularity.Granularity), TreeMap (java.util.TreeMap), DateTime (org.joda.time.DateTime), UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec), FileSystem (org.apache.hadoop.fs.FileSystem), Map (java.util.Map), Interval (org.joda.time.Interval)
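
The last statement rebuilds the spec around only the stale buckets, using the four-argument constructor (segment granularity, query granularity, rollup flag, intervals). A standalone sketch of that form with illustrative values:

import com.google.common.collect.ImmutableList;
import io.druid.java.util.common.granularity.Granularities;
import io.druid.segment.indexing.granularity.UniformGranularitySpec;
import org.joda.time.Interval;

// Same shape as the setGranularitySpec(...) call above: carry the existing
// segment and query granularities forward, preserve the rollup setting, and
// narrow the intervals to the buckets that need reprocessing.
UniformGranularitySpec narrowedSpec = new UniformGranularitySpec(
    Granularities.DAY,                                          // segment granularity
    Granularities.NONE,                                         // query granularity
    true,                                                       // rollup
    ImmutableList.of(Interval.parse("2015-11-06/2015-11-07"))   // illustrative bucket
);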

Example 13 with UniformGranularitySpec

use of io.druid.segment.indexing.granularity.UniformGranularitySpec in project druid by druid-io.

the class BatchDeltaIngestionTest method makeHadoopDruidIndexerConfig.

private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(Map<String, Object> inputSpec, File tmpDir) throws Exception {
    HadoopDruidIndexerConfig config = new HadoopDruidIndexerConfig(
        new HadoopIngestionSpec(
            new DataSchema(
                "website",
                MAPPER.convertValue(
                    new StringInputRowParser(
                        new CSVParseSpec(
                            new TimestampSpec("timestamp", "yyyyMMddHH", null),
                            new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null),
                            null,
                            ImmutableList.of("timestamp", "host", "host2", "visited_num")
                        ),
                        null
                    ),
                    Map.class
                ),
                new AggregatorFactory[] {
                    new LongSumAggregatorFactory("visited_sum", "visited_num"),
                    new HyperUniquesAggregatorFactory("unique_hosts", "host2")
                },
                new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(INTERVAL_FULL)),
                MAPPER
            ),
            new HadoopIOConfig(inputSpec, null, tmpDir.getCanonicalPath()),
            new HadoopTuningConfig(tmpDir.getCanonicalPath(), null, null, null, null, null, false, false, false, false, null, false, false, null, null, null, false, false)
        )
    );
    config.setShardSpecs(
        ImmutableMap.<Long, List<HadoopyShardSpec>>of(
            INTERVAL_FULL.getStartMillis(),
            ImmutableList.of(new HadoopyShardSpec(new HashBasedNumberedShardSpec(0, 1, null, HadoopDruidIndexerConfig.JSON_MAPPER), 0))
        )
    );
    config = HadoopDruidIndexerConfig.fromSpec(config.getSchema());
    return config;
}
Also used : HashBasedNumberedShardSpec (io.druid.timeline.partition.HashBasedNumberedShardSpec), LongSumAggregatorFactory (io.druid.query.aggregation.LongSumAggregatorFactory), DataSchema (io.druid.segment.indexing.DataSchema), UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec), CSVParseSpec (io.druid.data.input.impl.CSVParseSpec), StringInputRowParser (io.druid.data.input.impl.StringInputRowParser), TimestampSpec (io.druid.data.input.impl.TimestampSpec), HyperUniquesAggregatorFactory (io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory), DimensionsSpec (io.druid.data.input.impl.DimensionsSpec), Map (java.util.Map), ImmutableMap (com.google.common.collect.ImmutableMap)
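
The shard-spec wiring in setShardSpecs is dense, so here it is pulled apart. The HashBasedNumberedShardSpec arguments are, in order, the partition number, the total partition count, the partition dimensions (null in this example), and a Druid-configured ObjectMapper; a sketch with the same values as above:

// Single shard for the bucket: partition 0 of 1.
HashBasedNumberedShardSpec shardSpec =
    new HashBasedNumberedShardSpec(0, 1, null, HadoopDruidIndexerConfig.JSON_MAPPER);

// HadoopyShardSpec pairs the shard spec with a job-wide shard number (0 here);
// the outer map in the example keys the shard list by the bucket's start millis.
HadoopyShardSpec hadoopyShardSpec = new HadoopyShardSpec(shardSpec, 0);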

Example 14 with UniformGranularitySpec

use of io.druid.segment.indexing.granularity.UniformGranularitySpec in project druid by druid-io.

the class HadoopIngestionSpecTest method testPeriodSegmentGranularitySpec.

@Test
public void testPeriodSegmentGranularitySpec() {
    final HadoopIngestionSpec schema;
    try {
        schema = jsonReadWriteRead(
            "{\n"
            + "    \"dataSchema\": {\n"
            + "     \"dataSource\": \"foo\",\n"
            + "     \"metricsSpec\": [],\n"
            + "        \"granularitySpec\": {\n"
            + "                \"type\": \"uniform\",\n"
            + "                \"segmentGranularity\": {\"type\": \"period\", \"period\":\"PT1H\", \"timeZone\":\"America/Los_Angeles\"},\n"
            + "                \"intervals\": [\"2012-01-01/P1D\"]\n"
            + "        }\n"
            + "    }\n"
            + "}",
            HadoopIngestionSpec.class
        );
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
    final UniformGranularitySpec granularitySpec = (UniformGranularitySpec) schema.getDataSchema().getGranularitySpec();
    Assert.assertEquals(
        "getSegmentGranularity",
        new PeriodGranularity(new Period("PT1H"), null, DateTimeZone.forID("America/Los_Angeles")),
        granularitySpec.getSegmentGranularity()
    );
}
Also used : UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec), PeriodGranularity (io.druid.java.util.common.granularity.PeriodGranularity), Period (org.joda.time.Period), JsonProcessingException (com.fasterxml.jackson.core.JsonProcessingException), Test (org.junit.Test)
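
For reference, this is how the expected granularity in the assertion is put together: PeriodGranularity takes an ISO-8601 period, an optional origin, and a time zone. A minimal sketch with the same values as the test:

import io.druid.java.util.common.granularity.PeriodGranularity;
import org.joda.time.DateTimeZone;
import org.joda.time.Period;

// Hourly buckets, default origin (null), computed in America/Los_Angeles;
// this is what the period-typed segmentGranularity JSON above deserializes to.
PeriodGranularity hourlyLA = new PeriodGranularity(
    new Period("PT1H"),
    null,
    DateTimeZone.forID("America/Los_Angeles")
);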

Example 15 with UniformGranularitySpec

use of io.druid.segment.indexing.granularity.UniformGranularitySpec in project druid by druid-io.

the class GranularityPathSpecTest method testAddInputPath.

@Test
public void testAddInputPath() throws Exception {
    UserGroupInformation.setLoginUser(UserGroupInformation.createUserForTesting("test", new String[] { "testGroup" }));
    HadoopIngestionSpec spec = new HadoopIngestionSpec(
        new DataSchema(
            "foo",
            null,
            new AggregatorFactory[0],
            new UniformGranularitySpec(
                Granularities.DAY,
                Granularities.MINUTE,
                ImmutableList.of(new Interval("2015-11-06T00:00Z/2015-11-07T00:00Z"))
            ),
            jsonMapper
        ),
        new HadoopIOConfig(null, null, null),
        DEFAULT_TUNING_CONFIG
    );
    granularityPathSpec.setDataGranularity(Granularities.HOUR);
    granularityPathSpec.setFilePattern(".*");
    granularityPathSpec.setInputFormat(TextInputFormat.class);
    Job job = Job.getInstance();
    String formatStr = "file:%s/%s;org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
    testFolder.newFolder("test", "y=2015", "m=11", "d=06", "H=00");
    testFolder.newFolder("test", "y=2015", "m=11", "d=06", "H=02");
    testFolder.newFolder("test", "y=2015", "m=11", "d=06", "H=05");
    testFolder.newFile("test/y=2015/m=11/d=06/H=00/file1");
    testFolder.newFile("test/y=2015/m=11/d=06/H=02/file2");
    testFolder.newFile("test/y=2015/m=11/d=06/H=05/file3");
    testFolder.newFile("test/y=2015/m=11/d=06/H=05/file4");
    granularityPathSpec.setInputPath(testFolder.getRoot().getPath() + "/test");
    granularityPathSpec.addInputPaths(HadoopDruidIndexerConfig.fromSpec(spec), job);
    String actual = job.getConfiguration().get("mapreduce.input.multipleinputs.dir.formats");
    String expected = Joiner.on(",").join(
        Lists.newArrayList(
            String.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=00/file1"),
            String.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=02/file2"),
            String.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=05/file3"),
            String.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=05/file4")
        )
    );
    Assert.assertEquals("Did not find expected input paths", expected, actual);
}
Also used : HadoopIngestionSpec (io.druid.indexer.HadoopIngestionSpec), DataSchema (io.druid.segment.indexing.DataSchema), UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec), AggregatorFactory (io.druid.query.aggregation.AggregatorFactory), Job (org.apache.hadoop.mapreduce.Job), HadoopIOConfig (io.druid.indexer.HadoopIOConfig), Interval (org.joda.time.Interval), Test (org.junit.Test)
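
The y=/m=/d=/H= folders created above are the directory layout GranularityPathSpec expects for hour granularity. A minimal configuration sketch using the same setters as the test; the base path is illustrative, and it assumes the no-argument constructor that the test fixture would use:

// Hour-granular layout under the base path: y=YYYY/m=MM/d=DD/H=HH/<files>
GranularityPathSpec pathSpec = new GranularityPathSpec();
pathSpec.setDataGranularity(Granularities.HOUR);
pathSpec.setFilePattern(".*");                    // regex matched against file names
pathSpec.setInputFormat(TextInputFormat.class);
pathSpec.setInputPath("/data/website");           // illustrative base path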

Aggregations

UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec): 33 uses
DataSchema (io.druid.segment.indexing.DataSchema): 27 uses
Interval (org.joda.time.Interval): 21 uses
Test (org.junit.Test): 19 uses
AggregatorFactory (io.druid.query.aggregation.AggregatorFactory): 9 uses
File (java.io.File): 9 uses
DimensionsSpec (io.druid.data.input.impl.DimensionsSpec): 8 uses
TimestampSpec (io.druid.data.input.impl.TimestampSpec): 8 uses
RealtimeTuningConfig (io.druid.segment.indexing.RealtimeTuningConfig): 8 uses
StringInputRowParser (io.druid.data.input.impl.StringInputRowParser): 7 uses
DefaultObjectMapper (io.druid.jackson.DefaultObjectMapper): 7 uses
CountAggregatorFactory (io.druid.query.aggregation.CountAggregatorFactory): 7 uses
LongSumAggregatorFactory (io.druid.query.aggregation.LongSumAggregatorFactory): 7 uses
Period (org.joda.time.Period): 7 uses
Before (org.junit.Before): 7 uses
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 6 uses
DoubleSumAggregatorFactory (io.druid.query.aggregation.DoubleSumAggregatorFactory): 6 uses
RealtimeIOConfig (io.druid.segment.indexing.RealtimeIOConfig): 6 uses
DateTime (org.joda.time.DateTime): 6 uses
ImmutableMap (com.google.common.collect.ImmutableMap): 5 uses