use of io.druid.java.util.common.granularity.Granularity in project druid by druid-io.
the class AppenderatorPlumber method startPersistThread.
private void startPersistThread() {
final Granularity segmentGranularity = schema.getGranularitySpec().getSegmentGranularity();
final Period windowPeriod = config.getWindowPeriod();
final DateTime truncatedNow = segmentGranularity.bucketStart(new DateTime());
final long windowMillis = windowPeriod.toStandardDuration().getMillis();
log.info("Expect to run at [%s]", new DateTime().plus(new Duration(System.currentTimeMillis(), segmentGranularity.increment(truncatedNow).getMillis() + windowMillis)));
ScheduledExecutors.scheduleAtFixedRate(scheduledExecutor, new Duration(System.currentTimeMillis(), segmentGranularity.increment(truncatedNow).getMillis() + windowMillis), new Duration(truncatedNow, segmentGranularity.increment(truncatedNow)), new ThreadRenamingCallable<ScheduledExecutors.Signal>(String.format("%s-overseer-%d", schema.getDataSource(), config.getShardSpec().getPartitionNum())) {
@Override
public ScheduledExecutors.Signal doCall() {
if (stopped) {
log.info("Stopping merge-n-push overseer thread");
return ScheduledExecutors.Signal.STOP;
}
mergeAndPush();
if (stopped) {
log.info("Stopping merge-n-push overseer thread");
return ScheduledExecutors.Signal.STOP;
} else {
return ScheduledExecutors.Signal.REPEAT;
}
}
});
}
use of io.druid.java.util.common.granularity.Granularity in project druid by druid-io.
the class RealtimePlumber method startPersistThread.
protected void startPersistThread() {
final Granularity segmentGranularity = schema.getGranularitySpec().getSegmentGranularity();
final Period windowPeriod = config.getWindowPeriod();
final DateTime truncatedNow = segmentGranularity.bucketStart(new DateTime());
final long windowMillis = windowPeriod.toStandardDuration().getMillis();
log.info("Expect to run at [%s]", new DateTime().plus(new Duration(System.currentTimeMillis(), segmentGranularity.increment(truncatedNow).getMillis() + windowMillis)));
ScheduledExecutors.scheduleAtFixedRate(scheduledExecutor, new Duration(System.currentTimeMillis(), segmentGranularity.increment(truncatedNow).getMillis() + windowMillis), new Duration(truncatedNow, segmentGranularity.increment(truncatedNow)), new ThreadRenamingCallable<ScheduledExecutors.Signal>(String.format("%s-overseer-%d", schema.getDataSource(), config.getShardSpec().getPartitionNum())) {
@Override
public ScheduledExecutors.Signal doCall() {
if (stopped) {
log.info("Stopping merge-n-push overseer thread");
return ScheduledExecutors.Signal.STOP;
}
mergeAndPush();
if (stopped) {
log.info("Stopping merge-n-push overseer thread");
return ScheduledExecutors.Signal.STOP;
} else {
return ScheduledExecutors.Signal.REPEAT;
}
}
});
}
use of io.druid.java.util.common.granularity.Granularity in project druid by druid-io.
the class IndexTask method determineShardSpecs.
/**
* Determines the number of shards for each interval using a hash of queryGranularity timestamp + all dimensions (i.e
* hash-based partitioning). In the future we may want to also support single-dimension partitioning.
*/
private Map<Interval, List<ShardSpec>> determineShardSpecs(final TaskToolbox toolbox, final FirehoseFactory firehoseFactory) throws IOException {
final ObjectMapper jsonMapper = toolbox.getObjectMapper();
final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
final Granularity queryGranularity = granularitySpec.getQueryGranularity();
final boolean determineNumPartitions = ingestionSchema.getTuningConfig().getNumShards() == null;
final boolean determineIntervals = !ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();
final Map<Interval, List<ShardSpec>> shardSpecs = Maps.newHashMap();
// if we were given number of shards per interval and the intervals, we don't need to scan the data
if (!determineNumPartitions && !determineIntervals) {
log.info("numShards and intervals provided, skipping determine partition scan");
final SortedSet<Interval> intervals = ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().get();
final int numShards = ingestionSchema.getTuningConfig().getNumShards();
for (Interval interval : intervals) {
final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
if (numShards > 1) {
for (int i = 0; i < numShards; i++) {
intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
}
} else {
intervalShardSpecs.add(NoneShardSpec.instance());
}
shardSpecs.put(interval, intervalShardSpecs);
}
return shardSpecs;
}
// determine intervals containing data and prime HLL collectors
final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = Maps.newHashMap();
int thrownAway = 0;
log.info("Determining intervals and shardSpecs");
long determineShardSpecsStartMillis = System.currentTimeMillis();
try (final Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
while (firehose.hasMore()) {
final InputRow inputRow = firehose.nextRow();
final Interval interval;
if (determineIntervals) {
interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
} else {
final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
if (!optInterval.isPresent()) {
thrownAway++;
continue;
}
interval = optInterval.get();
}
if (!determineNumPartitions) {
// for the interval and don't instantiate a HLL collector
if (!hllCollectors.containsKey(interval)) {
hllCollectors.put(interval, Optional.<HyperLogLogCollector>absent());
}
continue;
}
if (!hllCollectors.containsKey(interval)) {
hllCollectors.put(interval, Optional.of(HyperLogLogCollector.makeLatestCollector()));
}
List<Object> groupKey = Rows.toGroupKey(queryGranularity.bucketStart(inputRow.getTimestamp()).getMillis(), inputRow);
hllCollectors.get(interval).get().add(hashFunction.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());
}
}
if (thrownAway > 0) {
log.warn("Unable to to find a matching interval for [%,d] events", thrownAway);
}
final ImmutableSortedMap<Interval, Optional<HyperLogLogCollector>> sortedMap = ImmutableSortedMap.copyOf(hllCollectors, Comparators.intervalsByStartThenEnd());
for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : sortedMap.entrySet()) {
final Interval interval = entry.getKey();
final Optional<HyperLogLogCollector> collector = entry.getValue();
final int numShards;
if (determineNumPartitions) {
final long numRows = new Double(collector.get().estimateCardinality()).longValue();
numShards = (int) Math.ceil((double) numRows / ingestionSchema.getTuningConfig().getTargetPartitionSize());
log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numShards);
} else {
numShards = ingestionSchema.getTuningConfig().getNumShards();
log.info("Creating [%,d] shards for interval [%s]", numShards, interval);
}
final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
if (numShards > 1) {
for (int i = 0; i < numShards; i++) {
intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
}
} else {
intervalShardSpecs.add(NoneShardSpec.instance());
}
shardSpecs.put(interval, intervalShardSpecs);
}
log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
return shardSpecs;
}
use of io.druid.java.util.common.granularity.Granularity in project druid by druid-io.
the class GranularUnprocessedPathSpec method addInputPaths.
@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
// This PathSpec breaks so many abstractions that we might as break some more
Preconditions.checkState(config.getGranularitySpec() instanceof UniformGranularitySpec, String.format("Cannot use %s without %s", GranularUnprocessedPathSpec.class.getSimpleName(), UniformGranularitySpec.class.getSimpleName()));
final Path betaInput = new Path(getInputPath());
final FileSystem fs = betaInput.getFileSystem(job.getConfiguration());
final Granularity segmentGranularity = config.getGranularitySpec().getSegmentGranularity();
Map<Long, Long> inputModifiedTimes = new TreeMap<>(Comparators.inverse(Comparators.comparable()));
for (FileStatus status : FSSpideringIterator.spiderIterable(fs, betaInput)) {
final DateTime key = segmentGranularity.toDate(status.getPath().toString());
final Long currVal = inputModifiedTimes.get(key);
final long mTime = status.getModificationTime();
inputModifiedTimes.put(key.getMillis(), currVal == null ? mTime : Math.max(currVal, mTime));
}
Set<Interval> bucketsToRun = Sets.newTreeSet(Comparators.intervals());
for (Map.Entry<Long, Long> entry : inputModifiedTimes.entrySet()) {
DateTime timeBucket = new DateTime(entry.getKey());
long mTime = entry.getValue();
String bucketOutput = String.format("%s/%s", config.getSchema().getIOConfig().getSegmentOutputPath(), segmentGranularity.toPath(timeBucket));
for (FileStatus fileStatus : FSSpideringIterator.spiderIterable(fs, new Path(bucketOutput))) {
if (fileStatus.getModificationTime() > mTime) {
bucketsToRun.add(new Interval(timeBucket, segmentGranularity.increment(timeBucket)));
break;
}
}
if (bucketsToRun.size() >= maxBuckets) {
break;
}
}
config.setGranularitySpec(new UniformGranularitySpec(segmentGranularity, config.getGranularitySpec().getQueryGranularity(), config.getGranularitySpec().isRollup(), Lists.newArrayList(bucketsToRun)));
return super.addInputPaths(config, job);
}
use of io.druid.java.util.common.granularity.Granularity in project druid by druid-io.
the class GranularityPathSpecTest method testBackwardCompatiblePeriodSegmentGranularitySerialization.
@Test
public void testBackwardCompatiblePeriodSegmentGranularitySerialization() throws JsonProcessingException {
final PeriodGranularity pt2S = new PeriodGranularity(new Period("PT2S"), null, DateTimeZone.UTC);
Assert.assertNotEquals("\"SECOND\"", jsonMapper.writeValueAsString(pt2S));
final Granularity pt1S = Granularities.SECOND;
Assert.assertEquals("\"SECOND\"", jsonMapper.writeValueAsString(pt1S));
}
Aggregations