Use of io.druid.timeline.partition.ShardSpec in project druid by druid-io.
Example from the class IndexTaskTest, method runTask:
private final List<DataSegment> runTask(final IndexTask indexTask) throws Exception {
  final List<DataSegment> segments = Lists.newArrayList();
  indexTask.run(new TaskToolbox(null, null, new TaskActionClient() {
    @Override
    public <RetType> RetType submit(TaskAction<RetType> taskAction) throws IOException {
      if (taskAction instanceof LockListAction) {
        return (RetType) Arrays.asList(new TaskLock("", "", null, new DateTime().toString()));
      }
      if (taskAction instanceof LockAcquireAction) {
        return (RetType) new TaskLock("groupId", "test", ((LockAcquireAction) taskAction).getInterval(), new DateTime().toString());
      }
      if (taskAction instanceof SegmentTransactionalInsertAction) {
        return (RetType) new SegmentPublishResult(((SegmentTransactionalInsertAction) taskAction).getSegments(), true);
      }
      if (taskAction instanceof SegmentAllocateAction) {
        SegmentAllocateAction action = (SegmentAllocateAction) taskAction;
        Interval interval = action.getPreferredSegmentGranularity().bucket(action.getTimestamp());
        ShardSpec shardSpec = new NumberedShardSpec(segmentAllocatePartitionCounter++, 0);
        return (RetType) new SegmentIdentifier(action.getDataSource(), interval, "latestVersion", shardSpec);
      }
      return null;
    }
  }, null, new DataSegmentPusher() {
    @Deprecated
    @Override
    public String getPathForHadoop(String dataSource) {
      return getPathForHadoop();
    }

    @Override
    public String getPathForHadoop() {
      return null;
    }

    @Override
    public DataSegment push(File file, DataSegment segment) throws IOException {
      segments.add(segment);
      return segment;
    }
  }, null, null, null, null, null, null, null, null, null, jsonMapper, temporaryFolder.newFolder(), indexMerger, indexIO, null, null, indexMergerV9));
  Collections.sort(segments);
  return segments;
}
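A hedged sketch of how a test might drive this helper; the createIndexTask factory and the assertion are hypothetical placeholders, not part of the class above:

// Hypothetical usage: build an IndexTask elsewhere in the test, run it through the stubbed
// toolbox, then check the ShardSpec that the stubbed SegmentAllocateAction handed out.
final IndexTask indexTask = createIndexTask();  // assumed helper, not shown above
final List<DataSegment> segments = runTask(indexTask);
for (DataSegment segment : segments) {
  // Each published segment carries the NumberedShardSpec allocated with the shared counter.
  Assert.assertTrue(segment.getShardSpec() instanceof NumberedShardSpec);
}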
Use of io.druid.timeline.partition.ShardSpec in project druid by druid-io.
Example from the class DeterminePartitionsJob, method run:
public boolean run() {
  try {
    if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
      throw new ISE("DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]", config.getPartitionsSpec());
    }
    if (!config.getPartitionsSpec().isAssumeGrouped()) {
      final Job groupByJob = Job.getInstance(new Configuration(), String.format("%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));
      JobHelper.injectSystemProperties(groupByJob);
      config.addJobProperties(groupByJob);
      groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
      groupByJob.setMapOutputKeyClass(BytesWritable.class);
      groupByJob.setMapOutputValueClass(NullWritable.class);
      groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
      groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
      groupByJob.setOutputKeyClass(BytesWritable.class);
      groupByJob.setOutputValueClass(NullWritable.class);
      groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
      JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);
      config.addInputPaths(groupByJob);
      config.intoConfiguration(groupByJob);
      FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
      groupByJob.submit();
      log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
      if (!groupByJob.waitForCompletion(true)) {
        log.error("Job failed: %s", groupByJob.getJobID());
        return false;
      }
    } else {
      log.info("Skipping group-by job.");
    }
    /*
     * Read grouped data and determine appropriate partitions.
     */
    final Job dimSelectionJob = Job.getInstance(new Configuration(), String.format("%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));
    dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");
    JobHelper.injectSystemProperties(dimSelectionJob);
    config.addJobProperties(dimSelectionJob);
    if (!config.getPartitionsSpec().isAssumeGrouped()) {
      // Read grouped data from the groupByJob.
      dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
      dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
      FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
    } else {
      // Directly read the source data, since we assume it's already grouped.
      dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
      config.addInputPaths(dimSelectionJob);
    }
    SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
    dimSelectionJob.setMapOutputValueClass(Text.class);
    dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
    dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
    dimSelectionJob.setOutputKeyClass(BytesWritable.class);
    dimSelectionJob.setOutputValueClass(Text.class);
    dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
    dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
    dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
    JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), dimSelectionJob);
    config.intoConfiguration(dimSelectionJob);
    FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());
    dimSelectionJob.submit();
    log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(), dimSelectionJob.getTrackingURL());
    if (!dimSelectionJob.waitForCompletion(true)) {
      log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
      return false;
    }
    /*
     * Load partitions determined by the previous job.
     */
    log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
    FileSystem fileSystem = null;
    Map<Long, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap();
    int shardCount = 0;
    for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
      final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
      if (fileSystem == null) {
        fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
      }
      if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
        List<ShardSpec> specs = config.JSON_MAPPER.readValue(Utils.openInputStream(dimSelectionJob, partitionInfoPath), new TypeReference<List<ShardSpec>>() {});
        List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
        for (int i = 0; i < specs.size(); ++i) {
          actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
          log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
        }
        shardSpecs.put(segmentGranularity.getStartMillis(), actualSpecs);
      } else {
        log.info("Path[%s] didn't exist!?", partitionInfoPath);
      }
    }
    config.setShardSpecs(shardSpecs);
    return true;
  } catch (Exception e) {
    throw Throwables.propagate(e);
  }
}
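The loop above wraps each deserialized ShardSpec in a HadoopyShardSpec so that every shard across all intervals receives a globally unique shardNum. A minimal hedged sketch of just that wrapping step, with made-up SingleDimensionShardSpec values standing in for the reducer output:

// Hypothetical: two single-dimension specs for one interval, split on the "host" dimension.
List<ShardSpec> specs = Arrays.asList(
    new SingleDimensionShardSpec("host", null, "m", 0),   // partition 0: host < "m"
    new SingleDimensionShardSpec("host", "m", null, 1)    // partition 1: host >= "m"
);
int shardCount = 0;
List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
for (ShardSpec spec : specs) {
  // HadoopyShardSpec pairs the actual ShardSpec with a job-wide shard number.
  actualSpecs.add(new HadoopyShardSpec(spec, shardCount++));
}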
Use of io.druid.timeline.partition.ShardSpec in project druid by druid-io.
Example from the class SchemalessIndexTest, method makeAppendedMMappedIndex:
private static QueryableIndex makeAppendedMMappedIndex(Iterable<Pair<String, AggregatorFactory[]>> files, final List<Interval> intervals) {
  try {
    File tmpFile = File.createTempFile("yay", "boo");
    tmpFile.delete();
    File mergedFile = new File(tmpFile, "merged");
    mergedFile.mkdirs();
    mergedFile.deleteOnExit();
    List<File> filesToMap = makeFilesToMap(tmpFile, files);
    VersionedIntervalTimeline<Integer, File> timeline = new VersionedIntervalTimeline<Integer, File>(Ordering.natural().nullsFirst());
    ShardSpec noneShardSpec = NoneShardSpec.instance();
    for (int i = 0; i < intervals.size(); i++) {
      timeline.add(intervals.get(i), i, noneShardSpec.createChunk(filesToMap.get(i)));
    }
    // TimelineObjectHolder is actually an iterable of iterable of indexable adapters
    final List<IndexableAdapter> adapters = Lists.newArrayList(Iterables.concat(
        Iterables.transform(timeline.lookup(new Interval("1000-01-01/3000-01-01")), new Function<TimelineObjectHolder<Integer, File>, Iterable<IndexableAdapter>>() {
          @Override
          public Iterable<IndexableAdapter> apply(final TimelineObjectHolder<Integer, File> timelineObjectHolder) {
            // Each chunk can be used to build the actual IndexableAdapter
            return Iterables.transform(timelineObjectHolder.getObject(), new Function<PartitionChunk<File>, IndexableAdapter>() {
              @Override
              public IndexableAdapter apply(PartitionChunk<File> chunk) {
                try {
                  return new RowboatFilteringIndexAdapter(new QueryableIndexIndexableAdapter(INDEX_IO.loadIndex(chunk.getObject())), new Predicate<Rowboat>() {
                    @Override
                    public boolean apply(Rowboat input) {
                      return timelineObjectHolder.getInterval().contains(input.getTimestamp());
                    }
                  });
                } catch (IOException e) {
                  throw Throwables.propagate(e);
                }
              }
            });
          }
        })));
    return INDEX_IO.loadIndex(INDEX_MERGER.append(adapters, null, mergedFile, indexSpec));
  } catch (IOException e) {
    throw Throwables.propagate(e);
  }
}
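Here the ShardSpec is used only as a PartitionChunk factory: NoneShardSpec marks each file as an unpartitioned, single-chunk entry in the timeline. A hedged sketch of that interaction in isolation, with made-up interval and path values:

// Hypothetical: register one unpartitioned file in the timeline, then look it up again.
VersionedIntervalTimeline<Integer, File> timeline =
    new VersionedIntervalTimeline<Integer, File>(Ordering.natural().nullsFirst());
ShardSpec noneShardSpec = NoneShardSpec.instance();
timeline.add(new Interval("2011-01-01/2011-01-02"), 0, noneShardSpec.createChunk(new File("/tmp/index-0")));
for (TimelineObjectHolder<Integer, File> holder : timeline.lookup(new Interval("2011-01-01/2011-01-02"))) {
  for (PartitionChunk<File> chunk : holder.getObject()) {
    // With NoneShardSpec each interval/version holds exactly one chunk.
    File indexDir = chunk.getObject();
  }
}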
Use of io.druid.timeline.partition.ShardSpec in project druid by druid-io.
Example from the class IndexTask, method generateAndPublishSegments:
private boolean generateAndPublishSegments(final TaskToolbox toolbox, final DataSchema dataSchema, final Map<Interval, List<ShardSpec>> shardSpecs, final String version, final FirehoseFactory firehoseFactory) throws IOException, InterruptedException {
  final GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
  final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null, null), null);
  final FireDepartmentMetrics fireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
  final Map<String, ShardSpec> sequenceNameToShardSpecMap = Maps.newHashMap();
  if (toolbox.getMonitorScheduler() != null) {
    toolbox.getMonitorScheduler().addMonitor(new RealtimeMetricsMonitor(ImmutableList.of(fireDepartmentForMetrics), ImmutableMap.of(DruidMetrics.TASK_ID, new String[] { getId() })));
  }
  final SegmentAllocator segmentAllocator;
  if (ingestionSchema.getIOConfig().isAppendToExisting()) {
    segmentAllocator = new ActionBasedSegmentAllocator(toolbox.getTaskActionClient(), dataSchema);
  } else {
    segmentAllocator = new SegmentAllocator() {
      @Override
      public SegmentIdentifier allocate(DateTime timestamp, String sequenceName, String previousSegmentId) throws IOException {
        Optional<Interval> interval = granularitySpec.bucketInterval(timestamp);
        if (!interval.isPresent()) {
          throw new ISE("Could not find interval for timestamp [%s]", timestamp);
        }
        ShardSpec shardSpec = sequenceNameToShardSpecMap.get(sequenceName);
        if (shardSpec == null) {
          throw new ISE("Could not find ShardSpec for sequenceName [%s]", sequenceName);
        }
        return new SegmentIdentifier(getDataSource(), interval.get(), version, shardSpec);
      }
    };
  }
  try (final Appenderator appenderator = newAppenderator(fireDepartmentMetrics, toolbox, dataSchema);
      final FiniteAppenderatorDriver driver = newDriver(appenderator, toolbox, segmentAllocator, fireDepartmentMetrics);
      final Firehose firehose = firehoseFactory.connect(dataSchema.getParser())) {
    final Supplier<Committer> committerSupplier = Committers.supplierFromFirehose(firehose);
    final Map<Interval, ShardSpecLookup> shardSpecLookups = Maps.newHashMap();
    if (driver.startJob() != null) {
      driver.clear();
    }
    try {
      while (firehose.hasMore()) {
        try {
          final InputRow inputRow = firehose.nextRow();
          final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
          if (!optInterval.isPresent()) {
            fireDepartmentMetrics.incrementThrownAway();
            continue;
          }
          final Interval interval = optInterval.get();
          if (!shardSpecLookups.containsKey(interval)) {
            final List<ShardSpec> intervalShardSpecs = shardSpecs.get(interval);
            if (intervalShardSpecs == null || intervalShardSpecs.isEmpty()) {
              throw new ISE("Failed to get shardSpec for interval[%s]", interval);
            }
            shardSpecLookups.put(interval, intervalShardSpecs.get(0).getLookup(intervalShardSpecs));
          }
          final ShardSpec shardSpec = shardSpecLookups.get(interval).getShardSpec(inputRow.getTimestampFromEpoch(), inputRow);
          final String sequenceName = String.format("index_%s_%s_%d", interval, version, shardSpec.getPartitionNum());
          if (!sequenceNameToShardSpecMap.containsKey(sequenceName)) {
            final ShardSpec shardSpecForPublishing = ingestionSchema.getTuningConfig().isForceExtendableShardSpecs() || ingestionSchema.getIOConfig().isAppendToExisting() ? new NumberedShardSpec(shardSpec.getPartitionNum(), shardSpecs.get(interval).size()) : shardSpec;
            sequenceNameToShardSpecMap.put(sequenceName, shardSpecForPublishing);
          }
          final SegmentIdentifier identifier = driver.add(inputRow, sequenceName, committerSupplier);
          if (identifier == null) {
            throw new ISE("Could not allocate segment for row with timestamp[%s]", inputRow.getTimestamp());
          }
          fireDepartmentMetrics.incrementProcessed();
        } catch (ParseException e) {
          if (ingestionSchema.getTuningConfig().isReportParseExceptions()) {
            throw e;
          } else {
            fireDepartmentMetrics.incrementUnparseable();
          }
        }
      }
    } finally {
      driver.persist(committerSupplier.get());
    }
    final TransactionalSegmentPublisher publisher = new TransactionalSegmentPublisher() {
      @Override
      public boolean publishSegments(Set<DataSegment> segments, Object commitMetadata) throws IOException {
        final SegmentTransactionalInsertAction action = new SegmentTransactionalInsertAction(segments, null, null);
        return toolbox.getTaskActionClient().submit(action).isSuccess();
      }
    };
    final SegmentsAndMetadata published = driver.finish(publisher, committerSupplier.get());
    if (published == null) {
      log.error("Failed to publish segments, aborting!");
      return false;
    } else {
      log.info("Published segments[%s]", Joiner.on(", ").join(Iterables.transform(published.getSegments(), new Function<DataSegment, String>() {
        @Override
        public String apply(DataSegment input) {
          return input.getIdentifier();
        }
      })));
      return true;
    }
  }
}
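The sequenceName bookkeeping above hinges on ShardSpec.getLookup: the first spec of an interval builds a ShardSpecLookup that routes each row to one of that interval's shards. A simplified, hedged sketch of just the routing step, assuming interval, version, shardSpecs, and inputRow come from the surrounding method:

// Route one row to its shard within a single interval.
final List<ShardSpec> intervalShardSpecs = shardSpecs.get(interval);
final ShardSpecLookup lookup = intervalShardSpecs.get(0).getLookup(intervalShardSpecs);
final ShardSpec target = lookup.getShardSpec(inputRow.getTimestampFromEpoch(), inputRow);
// The partition number then feeds the sequence name that keys the appenderator's open segments.
final String sequenceName = String.format("index_%s_%s_%d", interval, version, target.getPartitionNum());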
Use of io.druid.timeline.partition.ShardSpec in project druid by druid-io.
Example from the class IndexTask, method determineShardSpecs:
/**
* Determines the number of shards for each interval using a hash of queryGranularity timestamp + all dimensions (i.e
* hash-based partitioning). In the future we may want to also support single-dimension partitioning.
*/
private Map<Interval, List<ShardSpec>> determineShardSpecs(final TaskToolbox toolbox, final FirehoseFactory firehoseFactory) throws IOException {
  final ObjectMapper jsonMapper = toolbox.getObjectMapper();
  final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
  final Granularity queryGranularity = granularitySpec.getQueryGranularity();
  final boolean determineNumPartitions = ingestionSchema.getTuningConfig().getNumShards() == null;
  final boolean determineIntervals = !ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();
  final Map<Interval, List<ShardSpec>> shardSpecs = Maps.newHashMap();
  // if we were given the number of shards per interval and the intervals, we don't need to scan the data
  if (!determineNumPartitions && !determineIntervals) {
    log.info("numShards and intervals provided, skipping determine partition scan");
    final SortedSet<Interval> intervals = ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().get();
    final int numShards = ingestionSchema.getTuningConfig().getNumShards();
    for (Interval interval : intervals) {
      final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
      if (numShards > 1) {
        for (int i = 0; i < numShards; i++) {
          intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
        }
      } else {
        intervalShardSpecs.add(NoneShardSpec.instance());
      }
      shardSpecs.put(interval, intervalShardSpecs);
    }
    return shardSpecs;
  }
  // determine intervals containing data and prime HLL collectors
  final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = Maps.newHashMap();
  int thrownAway = 0;
  log.info("Determining intervals and shardSpecs");
  long determineShardSpecsStartMillis = System.currentTimeMillis();
  try (final Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
    while (firehose.hasMore()) {
      final InputRow inputRow = firehose.nextRow();
      final Interval interval;
      if (determineIntervals) {
        interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
      } else {
        final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
        if (!optInterval.isPresent()) {
          thrownAway++;
          continue;
        }
        interval = optInterval.get();
      }
      if (!determineNumPartitions) {
        // We don't need to determine the number of partitions, but we still need to record which
        // intervals contain data, so register the interval without instantiating a HLL collector.
        if (!hllCollectors.containsKey(interval)) {
          hllCollectors.put(interval, Optional.<HyperLogLogCollector>absent());
        }
        continue;
      }
      if (!hllCollectors.containsKey(interval)) {
        hllCollectors.put(interval, Optional.of(HyperLogLogCollector.makeLatestCollector()));
      }
      List<Object> groupKey = Rows.toGroupKey(queryGranularity.bucketStart(inputRow.getTimestamp()).getMillis(), inputRow);
      hllCollectors.get(interval).get().add(hashFunction.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());
    }
  }
  if (thrownAway > 0) {
    log.warn("Unable to find a matching interval for [%,d] events", thrownAway);
  }
  final ImmutableSortedMap<Interval, Optional<HyperLogLogCollector>> sortedMap = ImmutableSortedMap.copyOf(hllCollectors, Comparators.intervalsByStartThenEnd());
  for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : sortedMap.entrySet()) {
    final Interval interval = entry.getKey();
    final Optional<HyperLogLogCollector> collector = entry.getValue();
    final int numShards;
    if (determineNumPartitions) {
      final long numRows = new Double(collector.get().estimateCardinality()).longValue();
      numShards = (int) Math.ceil((double) numRows / ingestionSchema.getTuningConfig().getTargetPartitionSize());
      log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numShards);
    } else {
      numShards = ingestionSchema.getTuningConfig().getNumShards();
      log.info("Creating [%,d] shards for interval [%s]", numShards, interval);
    }
    final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
    if (numShards > 1) {
      for (int i = 0; i < numShards; i++) {
        intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
      }
    } else {
      intervalShardSpecs.add(NoneShardSpec.instance());
    }
    shardSpecs.put(interval, intervalShardSpecs);
  }
  log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
  return shardSpecs;
}
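The shard count above is a plain ceiling division of the estimated row count by targetPartitionSize. A hedged worked example with illustrative numbers:

// Hypothetical numbers: the HLL collector estimates about 7,500,000 rows for the interval and
// targetPartitionSize is 5,000,000, so the interval gets ceil(7.5M / 5M) = 2 hash-based shards.
final long numRows = 7500000L;
final int targetPartitionSize = 5000000;
final int numShards = (int) Math.ceil((double) numRows / targetPartitionSize);
final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
for (int i = 0; i < numShards; i++) {
  // Each spec knows its own partition number and the total number of shards in the interval.
  intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
}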