Use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.
The class CompactionTask, method createIngestionSchema.
/**
* Generate {@link ParallelIndexIngestionSpec} from input segments.
*
* @return an empty list if no input segments exist; otherwise, the generated ingestionSpecs.
*/
@VisibleForTesting
static List<ParallelIndexIngestionSpec> createIngestionSchema(
    final TaskToolbox toolbox,
    final LockGranularity lockGranularityInUse,
    final SegmentProvider segmentProvider,
    final PartitionConfigurationManager partitionConfigurationManager,
    @Nullable final DimensionsSpec dimensionsSpec,
    @Nullable final ClientCompactionTaskTransformSpec transformSpec,
    @Nullable final AggregatorFactory[] metricsSpec,
    @Nullable final ClientCompactionTaskGranularitySpec granularitySpec,
    final CoordinatorClient coordinatorClient,
    final SegmentCacheManagerFactory segmentCacheManagerFactory,
    final RetryPolicyFactory retryPolicyFactory,
    final boolean dropExisting
) throws IOException, SegmentLoadingException {
NonnullPair<Map<DataSegment, File>, List<TimelineObjectHolder<String, DataSegment>>> pair =
    prepareSegments(toolbox, segmentProvider, lockGranularityInUse);
final Map<DataSegment, File> segmentFileMap = pair.lhs;
final List<TimelineObjectHolder<String, DataSegment>> timelineSegments = pair.rhs;
if (timelineSegments.isEmpty()) {
return Collections.emptyList();
}
// Find the metadata for each interval to be compacted.
// queryableIndexAndSegments is sorted by the interval of the DataSegment.
final List<NonnullPair<QueryableIndex, DataSegment>> queryableIndexAndSegments =
    loadSegments(timelineSegments, segmentFileMap, toolbox.getIndexIO());
final CompactionTuningConfig compactionTuningConfig = partitionConfigurationManager.computeTuningConfig();
if (granularitySpec == null || granularitySpec.getSegmentGranularity() == null) {
// original granularity
final Map<Interval, List<NonnullPair<QueryableIndex, DataSegment>>> intervalToSegments = new TreeMap<>(Comparators.intervalsByStartThenEnd());
queryableIndexAndSegments.forEach(p -> intervalToSegments.computeIfAbsent(p.rhs.getInterval(), k -> new ArrayList<>()).add(p));
// Unify overlapping intervals so that overlapping segments are compacted within the same ingestionSpec.
List<NonnullPair<Interval, List<NonnullPair<QueryableIndex, DataSegment>>>> intervalToSegmentsUnified = new ArrayList<>();
Interval union = null;
List<NonnullPair<QueryableIndex, DataSegment>> segments = new ArrayList<>();
for (Entry<Interval, List<NonnullPair<QueryableIndex, DataSegment>>> entry : intervalToSegments.entrySet()) {
Interval cur = entry.getKey();
if (union == null) {
union = cur;
segments.addAll(entry.getValue());
} else if (union.overlaps(cur)) {
union = Intervals.utc(union.getStartMillis(), Math.max(union.getEndMillis(), cur.getEndMillis()));
segments.addAll(entry.getValue());
} else {
intervalToSegmentsUnified.add(new NonnullPair<>(union, segments));
union = cur;
segments = new ArrayList<>(entry.getValue());
}
}
// Add the final unified group.
intervalToSegmentsUnified.add(new NonnullPair<>(union, segments));
final List<ParallelIndexIngestionSpec> specs = new ArrayList<>(intervalToSegmentsUnified.size());
for (NonnullPair<Interval, List<NonnullPair<QueryableIndex, DataSegment>>> entry : intervalToSegmentsUnified) {
final Interval interval = entry.lhs;
final List<NonnullPair<QueryableIndex, DataSegment>> segmentsToCompact = entry.rhs;
// Derive the segment granularity to use from the unified interval. If a granularitySpec
// was provided, only its segmentGranularity is overridden below; otherwise a new
// ClientCompactionTaskGranularitySpec is created with it.
Granularity segmentGranularityToUse = GranularityType.fromPeriod(interval.toPeriod()).getDefaultGranularity();
final DataSchema dataSchema = createDataSchema(
    segmentProvider.dataSource,
    segmentsToCompact,
    dimensionsSpec,
    transformSpec,
    metricsSpec,
    granularitySpec == null
        ? new ClientCompactionTaskGranularitySpec(segmentGranularityToUse, null, null)
        : granularitySpec.withSegmentGranularity(segmentGranularityToUse));
specs.add(
    new ParallelIndexIngestionSpec(
        dataSchema,
        createIoConfig(toolbox, dataSchema, interval, coordinatorClient, segmentCacheManagerFactory, retryPolicyFactory, dropExisting),
        compactionTuningConfig));
}
return specs;
} else {
// given segment granularity
final DataSchema dataSchema = createDataSchema(
    segmentProvider.dataSource,
    queryableIndexAndSegments,
    dimensionsSpec,
    transformSpec,
    metricsSpec,
    granularitySpec);
return Collections.singletonList(
    new ParallelIndexIngestionSpec(
        dataSchema,
        createIoConfig(toolbox, dataSchema, segmentProvider.interval, coordinatorClient, segmentCacheManagerFactory, retryPolicyFactory, dropExisting),
        compactionTuningConfig));
}
}
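The interval-union pass above is the heart of the original-granularity branch. Below is a minimal standalone sketch of the same merge logic, assuming the input intervals are sorted by start then end and that millisecond-precision Joda-Time Intervals are acceptable; the helper name unionOverlapping is ours, not Druid's.

import java.util.ArrayList;
import java.util.List;
import org.joda.time.Interval;

// Minimal sketch: collapse a sorted list of intervals into disjoint unions,
// so that overlapping entries end up in one group.
static List<Interval> unionOverlapping(List<Interval> sorted) {
    final List<Interval> unified = new ArrayList<>();
    Interval union = null;
    for (Interval cur : sorted) {
        if (union == null) {
            union = cur;
        } else if (union.overlaps(cur)) {
            // Starts are sorted, so only the end of the running union can grow.
            union = new Interval(union.getStartMillis(), Math.max(union.getEndMillis(), cur.getEndMillis()));
        } else {
            unified.add(union);
            union = cur;
        }
    }
    if (union != null) {
        unified.add(union); // flush the final group
    }
    return unified;
}

For example, [2000/2002], [2001/2003], [2005/2006] collapse to [2000/2003] and [2005/2006].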
Use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.
The class DruidInputSource, method getTimelineForSegmentIds.
public static List<TimelineObjectHolder<String, DataSegment>> getTimelineForSegmentIds(
    CoordinatorClient coordinatorClient,
    String dataSource,
    List<WindowedSegmentId> segmentIds) {
final SortedMap<Interval, TimelineObjectHolder<String, DataSegment>> timeline = new TreeMap<>(Comparators.intervalsByStartThenEnd());
for (WindowedSegmentId windowedSegmentId : Preconditions.checkNotNull(segmentIds, "segmentIds")) {
final DataSegment segment = coordinatorClient.fetchUsedSegment(dataSource, windowedSegmentId.getSegmentId());
for (Interval interval : windowedSegmentId.getIntervals()) {
final TimelineObjectHolder<String, DataSegment> existingHolder = timeline.get(interval);
if (existingHolder != null) {
if (!existingHolder.getVersion().equals(segment.getVersion())) {
throw new ISE(
    "Timeline segments with the same interval should have the same version: existing version[%s] vs new segment[%s]",
    existingHolder.getVersion(),
    segment);
}
existingHolder.getObject().add(segment.getShardSpec().createChunk(segment));
} else {
timeline.put(interval, new TimelineObjectHolder<>(
    interval,
    segment.getInterval(),
    segment.getVersion(),
    new PartitionHolder<>(segment.getShardSpec().createChunk(segment))));
}
}
}
// Validate that none of the given windows overlaps (except for when multiple segments share exactly the
// same interval).
Interval lastInterval = null;
for (Interval interval : timeline.keySet()) {
if (lastInterval != null && interval.overlaps(lastInterval)) {
throw new IAE("Distinct intervals in input segments may not overlap: [%s] vs [%s]", lastInterval, interval);
}
lastInterval = interval;
}
return new ArrayList<>(timeline.values());
}
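A hypothetical call site might look like the sketch below. The CoordinatorClient wiring and the segment id string are illustrative, and we assume WindowedSegmentId's constructor takes the segment id plus the intervals it is windowed to.

// Hedged usage sketch; coordinatorClient is assumed to be constructed elsewhere,
// and the segment id below is a made-up example.
final List<WindowedSegmentId> segmentIds = ImmutableList.of(
    new WindowedSegmentId(
        "wiki_2000-01-01T00:00:00.000Z_2000-01-02T00:00:00.000Z_v1",
        ImmutableList.of(Intervals.of("2000-01-01/2000-01-02"))));
final List<TimelineObjectHolder<String, DataSegment>> timeline =
    DruidInputSource.getTimelineForSegmentIds(coordinatorClient, "wiki", segmentIds);
// Each holder groups the chunks of all segments sharing one distinct interval.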
Use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.
The class NumberedShardSpecTest, method testVersionedIntervalTimelineBehaviorForNumberedShardSpec.
private void testVersionedIntervalTimelineBehaviorForNumberedShardSpec(
    List<PartitionChunk<OvershadowableString>> chunks,
    Set<OvershadowableString> expectedObjects) {
VersionedIntervalTimeline<String, OvershadowableString> timeline = new VersionedIntervalTimeline<>(Ordering.natural());
Interval interval = Intervals.of("2000/3000");
String version = "v1";
for (PartitionChunk<OvershadowableString> chunk : chunks) {
timeline.add(interval, version, chunk);
}
Set<OvershadowableString> actualObjects = new HashSet<>();
List<TimelineObjectHolder<String, OvershadowableString>> entries = timeline.lookup(interval);
for (TimelineObjectHolder<String, OvershadowableString> entry : entries) {
for (PartitionChunk<OvershadowableString> chunk : entry.getObject()) {
actualObjects.add(chunk.getObject());
}
}
Assert.assertEquals(expectedObjects, actualObjects);
}
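A call to this helper might look like the sketch below, mirroring how numbered chunks are typically built; the argument order of the test-only OvershadowableString constructor is an assumption here.

// Hedged usage sketch: three numbered chunks that together form a complete core.
final List<PartitionChunk<OvershadowableString>> chunks = ImmutableList.of(
    new NumberedShardSpec(0, 3).createChunk(new OvershadowableString("a", 0)),
    new NumberedShardSpec(1, 3).createChunk(new OvershadowableString("b", 1)),
    new NumberedShardSpec(2, 3).createChunk(new OvershadowableString("c", 2)));
// With a complete numbered core, the lookup should surface all three values.
testVersionedIntervalTimelineBehaviorForNumberedShardSpec(
    chunks,
    ImmutableSet.of(
        new OvershadowableString("a", 0),
        new OvershadowableString("b", 1),
        new OvershadowableString("c", 2)));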
Use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.
The class DataSourceOptimizer, method optimize.
/**
 * Does the main work of materialized view selection: transforms the user query into one or more sub-queries.
 *
 * In each sub-query, the dataSource is a derivative of the dataSource in the user query, and the union of all
 * sub-queries' intervals equals the interval in the user query.
 *
 * Derived dataSources with the smallest average data size per segment granularity have the highest priority to
 * replace the dataSource in the user query.
 *
 * @param query only a TopNQuery, TimeseriesQuery, or GroupByQuery can be optimized
 * @return a list of queries with the selected derived dataSources and intervals
 */
public List<Query> optimize(Query query) {
long start = System.currentTimeMillis();
// Only a TableDataSource can be optimized.
if (!(query instanceof TopNQuery || query instanceof TimeseriesQuery || query instanceof GroupByQuery)
    || !(query.getDataSource() instanceof TableDataSource)) {
return Collections.singletonList(query);
}
String datasourceName = ((TableDataSource) query.getDataSource()).getName();
// Get all derivatives for the datasource in the query. The derivatives set is sorted by
// average data size per segment granularity.
Set<DerivativeDataSource> derivatives = DerivativeDataSourceManager.getDerivatives(datasourceName);
if (derivatives.isEmpty()) {
return Collections.singletonList(query);
}
lock.readLock().lock();
try {
totalCount.computeIfAbsent(datasourceName, dsName -> new AtomicLong(0)).incrementAndGet();
hitCount.putIfAbsent(datasourceName, new AtomicLong(0));
AtomicLong costTimeOfDataSource = costTime.computeIfAbsent(datasourceName, dsName -> new AtomicLong(0));
// Get all fields that the query requires.
Set<String> requiredFields = MaterializedViewUtils.getRequiredFields(query);
Set<DerivativeDataSource> derivativesWithRequiredFields = new HashSet<>();
for (DerivativeDataSource derivativeDataSource : derivatives) {
derivativesHitCount.putIfAbsent(derivativeDataSource.getName(), new AtomicLong(0));
if (derivativeDataSource.getColumns().containsAll(requiredFields)) {
derivativesWithRequiredFields.add(derivativeDataSource);
}
}
// If no derivative contains all the required fields, materialized view selection fails.
if (derivativesWithRequiredFields.isEmpty()) {
missFields.computeIfAbsent(datasourceName, dsName -> new ConcurrentHashMap<>())
    .computeIfAbsent(requiredFields, rf -> new AtomicLong(0))
    .incrementAndGet();
costTimeOfDataSource.addAndGet(System.currentTimeMillis() - start);
return Collections.singletonList(query);
}
List<Query> queries = new ArrayList<>();
List<Interval> remainingQueryIntervals = (List<Interval>) query.getIntervals();
for (DerivativeDataSource derivativeDataSource : ImmutableSortedSet.copyOf(derivativesWithRequiredFields)) {
final List<Interval> derivativeIntervals = remainingQueryIntervals.stream()
    .flatMap(interval -> serverView
        .getTimeline(DataSourceAnalysis.forDataSource(new TableDataSource(derivativeDataSource.getName())))
        .orElseThrow(() -> new ISE("No timeline for dataSource: %s", derivativeDataSource.getName()))
        .lookup(interval)
        .stream()
        .map(TimelineObjectHolder::getInterval))
    .collect(Collectors.toList());
// If this derivative has no segments in the remaining query intervals, it cannot be selected.
if (derivativeIntervals.isEmpty()) {
continue;
}
remainingQueryIntervals = MaterializedViewUtils.minus(remainingQueryIntervals, derivativeIntervals);
queries.add(query
    .withDataSource(new TableDataSource(derivativeDataSource.getName()))
    .withQuerySegmentSpec(new MultipleIntervalSegmentSpec(derivativeIntervals)));
derivativesHitCount.get(derivativeDataSource.getName()).incrementAndGet();
if (remainingQueryIntervals.isEmpty()) {
break;
}
}
if (queries.isEmpty()) {
costTime.get(datasourceName).addAndGet(System.currentTimeMillis() - start);
return Collections.singletonList(query);
}
// Any intervals not covered by a derivative are queried against the original datasource.
if (!remainingQueryIntervals.isEmpty()) {
queries.add(query.withQuerySegmentSpec(new MultipleIntervalSegmentSpec(remainingQueryIntervals)));
}
hitCount.get(datasourceName).incrementAndGet();
costTime.get(datasourceName).addAndGet(System.currentTimeMillis() - start);
return queries;
} finally {
lock.readLock().unlock();
}
}
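The remainingQueryIntervals bookkeeping above depends on interval subtraction. The following is only a minimal sketch of what a list-based minus could look like, assuming both input lists are sorted by start and internally non-overlapping; it is not meant to mirror MaterializedViewUtils.minus exactly.

import java.util.ArrayList;
import java.util.List;
import org.joda.time.Interval;

// Sketch of A minus B over interval lists; millisecond arithmetic via Joda Intervals.
static List<Interval> minus(List<Interval> a, List<Interval> b) {
    final List<Interval> result = new ArrayList<>();
    for (Interval left : a) {
        long start = left.getStartMillis();
        for (Interval right : b) {
            if (right.getEndMillis() <= start || right.getStartMillis() >= left.getEndMillis()) {
                continue; // no overlap with the uncovered remainder of 'left'
            }
            if (right.getStartMillis() > start) {
                result.add(new Interval(start, right.getStartMillis()));
            }
            start = Math.max(start, right.getEndMillis());
        }
        if (start < left.getEndMillis()) {
            result.add(new Interval(start, left.getEndMillis()));
        }
    }
    return result;
}

For example, [2000/2010] minus [2002/2004] yields [2000/2002] and [2004/2010].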
Use of org.apache.druid.timeline.TimelineObjectHolder in project hive by apache.
The class DruidStorageHandlerUtils, method publishSegmentsAndCommit.
/**
* First computes the segment timeline to accommodate new segments for the insert-into case.
* Then moves segments to druid deep storage with updated metadata/version.
* ALL IS DONE IN ONE TRANSACTION
*
* @param connector DBI connector to commit
* @param metadataStorageTablesConfig Druid metadata tables definitions
* @param dataSource Druid datasource name
* @param segments List of segments to move and commit to metadata
* @param overwrite if it is an insert overwrite
* @param conf Configuration
* @param dataSegmentPusher segment pusher
* @return List of successfully published Druid segments.
* This list has the updated versions and metadata about segments after move and timeline sorting
* @throws CallbackFailedException in case the connector cannot add the segment to the DB.
*/
@SuppressWarnings("unchecked")
static List<DataSegment> publishSegmentsAndCommit(
    final SQLMetadataConnector connector,
    final MetadataStorageTablesConfig metadataStorageTablesConfig,
    final String dataSource,
    final List<DataSegment> segments,
    boolean overwrite,
    Configuration conf,
    DataSegmentPusher dataSegmentPusher) throws CallbackFailedException {
return connector.getDBI().inTransaction((handle, transactionStatus) -> {
// We create the timeline for the existing and new segments
VersionedIntervalTimeline<String, DataSegment> timeline;
if (overwrite) {
// If we are overwriting, we disable existing sources
disableDataSourceWithHandle(handle, metadataStorageTablesConfig, dataSource);
// When overwriting, we just start with empty timeline,
// as we are overwriting segments with new versions
timeline = new VersionedIntervalTimeline<>(Ordering.natural());
} else {
// Append Mode
if (segments.isEmpty()) {
// If there are no new segments, we can just bail out
return Collections.emptyList();
}
// Otherwise, build a timeline of existing segments in metadata storage
Interval indexedInterval = JodaUtils.umbrellaInterval(
    segments.stream().map(DataSegment::getInterval).collect(Collectors.toList()));
LOG.info("Building timeline for umbrella Interval [{}]", indexedInterval);
timeline = getTimelineForIntervalWithHandle(handle, dataSource, indexedInterval, metadataStorageTablesConfig);
}
final List<DataSegment> finalSegmentsToPublish = Lists.newArrayList();
for (DataSegment segment : segments) {
List<TimelineObjectHolder<String, DataSegment>> existingChunks = timeline.lookup(segment.getInterval());
if (existingChunks.size() > 1) {
// Druid shard specs do not support multiple partitions of the same interval with different granularity.
throw new IllegalStateException(String.format(
    "Cannot allocate new segment for dataSource[%s], interval[%s], already have [%,d] chunks. "
        + "Not possible to append new segment.",
    dataSource, segment.getInterval(), existingChunks.size()));
}
// Find out the segment with latest version and maximum partition number
SegmentIdWithShardSpec max = null;
final ShardSpec newShardSpec;
final String newVersion;
if (!existingChunks.isEmpty()) {
// Some chunks already exist for this interval; find the one with the max partition number.
TimelineObjectHolder<String, DataSegment> existingHolder = Iterables.getOnlyElement(existingChunks);
for (PartitionChunk<DataSegment> existing : existingHolder.getObject()) {
if (max == null || max.getShardSpec().getPartitionNum() < existing.getObject().getShardSpec().getPartitionNum()) {
max = SegmentIdWithShardSpec.fromDataSegment(existing.getObject());
}
}
}
if (max == null) {
// No existing shard present in the database, use the current version.
newShardSpec = segment.getShardSpec();
newVersion = segment.getVersion();
} else {
// Use the version of the existing max segment to generate the new shard spec.
newShardSpec = getNextPartitionShardSpec(max.getShardSpec());
newVersion = max.getVersion();
}
DataSegment publishedSegment = publishSegmentWithShardSpec(
    segment, newShardSpec, newVersion, getPath(segment).getFileSystem(conf), dataSegmentPusher);
finalSegmentsToPublish.add(publishedSegment);
timeline.add(
    publishedSegment.getInterval(),
    publishedSegment.getVersion(),
    publishedSegment.getShardSpec().createChunk(publishedSegment));
}
// Publish new segments to metadata storage
final PreparedBatch batch = handle.prepareBatch(String.format(
    "INSERT INTO %1$s (id, dataSource, created_date, start, \"end\", partitioned, version, used, payload) "
        + "VALUES (:id, :dataSource, :created_date, :start, :end, :partitioned, :version, :used, :payload)",
    metadataStorageTablesConfig.getSegmentsTable()));
for (final DataSegment segment : finalSegmentsToPublish) {
batch.add(new ImmutableMap.Builder<String, Object>()
    .put("id", segment.getId().toString())
    .put("dataSource", segment.getDataSource())
    .put("created_date", new DateTime().toString())
    .put("start", segment.getInterval().getStart().toString())
    .put("end", segment.getInterval().getEnd().toString())
    .put("partitioned", !(segment.getShardSpec() instanceof NoneShardSpec))
    .put("version", segment.getVersion())
    .put("used", true)
    .put("payload", JSON_MAPPER.writeValueAsBytes(segment))
    .build());
LOG.info("Published {}", segment.getId().toString());
}
batch.execute();
return finalSegmentsToPublish;
});
}