
Example 6 with TimelineObjectHolder

use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.

the class CompactionTask method createIngestionSchema.

/**
 * Generate {@link ParallelIndexIngestionSpec} from input segments.
 *
 * @return an empty list if the input segments don't exist; otherwise, the generated ingestionSpecs.
 */
@VisibleForTesting
static List<ParallelIndexIngestionSpec> createIngestionSchema(
        final TaskToolbox toolbox,
        final LockGranularity lockGranularityInUse,
        final SegmentProvider segmentProvider,
        final PartitionConfigurationManager partitionConfigurationManager,
        @Nullable final DimensionsSpec dimensionsSpec,
        @Nullable final ClientCompactionTaskTransformSpec transformSpec,
        @Nullable final AggregatorFactory[] metricsSpec,
        @Nullable final ClientCompactionTaskGranularitySpec granularitySpec,
        final CoordinatorClient coordinatorClient,
        final SegmentCacheManagerFactory segmentCacheManagerFactory,
        final RetryPolicyFactory retryPolicyFactory,
        final boolean dropExisting
) throws IOException, SegmentLoadingException {
    NonnullPair<Map<DataSegment, File>, List<TimelineObjectHolder<String, DataSegment>>> pair = prepareSegments(toolbox, segmentProvider, lockGranularityInUse);
    final Map<DataSegment, File> segmentFileMap = pair.lhs;
    final List<TimelineObjectHolder<String, DataSegment>> timelineSegments = pair.rhs;
    if (timelineSegments.size() == 0) {
        return Collections.emptyList();
    }
    // find metadata for interval
    // queryableIndexAndSegments is sorted by the interval of the dataSegment
    final List<NonnullPair<QueryableIndex, DataSegment>> queryableIndexAndSegments = loadSegments(timelineSegments, segmentFileMap, toolbox.getIndexIO());
    final CompactionTuningConfig compactionTuningConfig = partitionConfigurationManager.computeTuningConfig();
    if (granularitySpec == null || granularitySpec.getSegmentGranularity() == null) {
        // original granularity
        final Map<Interval, List<NonnullPair<QueryableIndex, DataSegment>>> intervalToSegments = new TreeMap<>(Comparators.intervalsByStartThenEnd());
        queryableIndexAndSegments.forEach(p -> intervalToSegments.computeIfAbsent(p.rhs.getInterval(), k -> new ArrayList<>()).add(p));
        // Unify overlapping intervals so that overlapping segments are compacted by the same ingestionSpec.
        List<NonnullPair<Interval, List<NonnullPair<QueryableIndex, DataSegment>>>> intervalToSegmentsUnified = new ArrayList<>();
        Interval union = null;
        List<NonnullPair<QueryableIndex, DataSegment>> segments = new ArrayList<>();
        for (Entry<Interval, List<NonnullPair<QueryableIndex, DataSegment>>> entry : intervalToSegments.entrySet()) {
            Interval cur = entry.getKey();
            if (union == null) {
                union = cur;
                segments.addAll(entry.getValue());
            } else if (union.overlaps(cur)) {
                union = Intervals.utc(union.getStartMillis(), Math.max(union.getEndMillis(), cur.getEndMillis()));
                segments.addAll(entry.getValue());
            } else {
                intervalToSegmentsUnified.add(new NonnullPair<>(union, segments));
                union = cur;
                segments = new ArrayList<>(entry.getValue());
            }
        }
        intervalToSegmentsUnified.add(new NonnullPair<>(union, segments));
        final List<ParallelIndexIngestionSpec> specs = new ArrayList<>(intervalToSegmentsUnified.size());
        for (NonnullPair<Interval, List<NonnullPair<QueryableIndex, DataSegment>>> entry : intervalToSegmentsUnified) {
            final Interval interval = entry.lhs;
            final List<NonnullPair<QueryableIndex, DataSegment>> segmentsToCompact = entry.rhs;
            // If granularitySpec is not null, set its segmentGranularity. Otherwise,
            // create a new granularitySpec with the segmentGranularity derived from the interval.
            Granularity segmentGranularityToUse = GranularityType.fromPeriod(interval.toPeriod()).getDefaultGranularity();
            final DataSchema dataSchema = createDataSchema(segmentProvider.dataSource, segmentsToCompact, dimensionsSpec, transformSpec, metricsSpec, granularitySpec == null ? new ClientCompactionTaskGranularitySpec(segmentGranularityToUse, null, null) : granularitySpec.withSegmentGranularity(segmentGranularityToUse));
            specs.add(new ParallelIndexIngestionSpec(dataSchema, createIoConfig(toolbox, dataSchema, interval, coordinatorClient, segmentCacheManagerFactory, retryPolicyFactory, dropExisting), compactionTuningConfig));
        }
        return specs;
    } else {
        // given segment granularity
        final DataSchema dataSchema = createDataSchema(segmentProvider.dataSource, queryableIndexAndSegments, dimensionsSpec, transformSpec, metricsSpec, granularitySpec);
        return Collections.singletonList(new ParallelIndexIngestionSpec(dataSchema, createIoConfig(toolbox, dataSchema, segmentProvider.interval, coordinatorClient, segmentCacheManagerFactory, retryPolicyFactory, dropExisting), compactionTuningConfig));
    }
}
Also used : ArrayList(java.util.ArrayList) LockGranularity(org.apache.druid.indexing.common.LockGranularity) Granularity(org.apache.druid.java.util.common.granularity.Granularity) DataSegment(org.apache.druid.timeline.DataSegment) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) NonnullPair(org.apache.druid.java.util.common.NonnullPair) ParallelIndexIngestionSpec(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec) TreeMap(java.util.TreeMap) ClientCompactionTaskGranularitySpec(org.apache.druid.client.indexing.ClientCompactionTaskGranularitySpec) DataSchema(org.apache.druid.segment.indexing.DataSchema) TimelineObjectHolder(org.apache.druid.timeline.TimelineObjectHolder) QueryableIndex(org.apache.druid.segment.QueryableIndex) Map(java.util.Map) BiMap(com.google.common.collect.BiMap) HashMap(java.util.HashMap) HashBiMap(com.google.common.collect.HashBiMap) TreeMap(java.util.TreeMap) File(java.io.File) Interval(org.joda.time.Interval) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
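
The interval-unification loop is the subtle part of this example: segments whose intervals overlap must end up in the same compaction group, so adjacent map entries are merged while they overlap. Below is a minimal, self-contained sketch of the same merge logic, using only Joda-Time; the class name and interval values are hypothetical and it is not the Druid implementation itself.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import org.joda.time.Interval;

public class IntervalUnionSketch {
    // Merge intervals so that overlapping ones collapse into a single union interval,
    // mirroring the unification loop in CompactionTask#createIngestionSchema.
    static List<Interval> unifyOverlapping(List<Interval> intervals) {
        List<Interval> sorted = new ArrayList<>(intervals);
        sorted.sort(Comparator.comparingLong(Interval::getStartMillis)
                              .thenComparingLong(Interval::getEndMillis));
        List<Interval> unified = new ArrayList<>();
        Interval union = null;
        for (Interval cur : sorted) {
            if (union == null) {
                union = cur;
            } else if (union.overlaps(cur)) {
                // Starts are sorted, so only the end of the running union can grow.
                union = new Interval(union.getStartMillis(), Math.max(union.getEndMillis(), cur.getEndMillis()));
            } else {
                unified.add(union);
                union = cur;
            }
        }
        if (union != null) {
            unified.add(union);
        }
        return unified;
    }

    public static void main(String[] args) {
        // The first two intervals overlap and merge into 2020-01-01/2020-01-15; the third stays separate.
        System.out.println(unifyOverlapping(Arrays.asList(
            Interval.parse("2020-01-01/2020-01-10"),
            Interval.parse("2020-01-05/2020-01-15"),
            Interval.parse("2020-02-01/2020-02-05"))));
    }
}

In createIngestionSchema itself, each merged interval also carries its list of (QueryableIndex, DataSegment) pairs, and the merged interval determines the segmentGranularity used when no granularitySpec is supplied.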

Example 7 with TimelineObjectHolder

use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.

the class DruidInputSource method getTimelineForSegmentIds.

public static List<TimelineObjectHolder<String, DataSegment>> getTimelineForSegmentIds(CoordinatorClient coordinatorClient, String dataSource, List<WindowedSegmentId> segmentIds) {
    final SortedMap<Interval, TimelineObjectHolder<String, DataSegment>> timeline = new TreeMap<>(Comparators.intervalsByStartThenEnd());
    for (WindowedSegmentId windowedSegmentId : Preconditions.checkNotNull(segmentIds, "segmentIds")) {
        final DataSegment segment = coordinatorClient.fetchUsedSegment(dataSource, windowedSegmentId.getSegmentId());
        for (Interval interval : windowedSegmentId.getIntervals()) {
            final TimelineObjectHolder<String, DataSegment> existingHolder = timeline.get(interval);
            if (existingHolder != null) {
                if (!existingHolder.getVersion().equals(segment.getVersion())) {
                    throw new ISE("Timeline segments with the same interval should have the same version: " + "existing version[%s] vs new segment[%s]", existingHolder.getVersion(), segment);
                }
                existingHolder.getObject().add(segment.getShardSpec().createChunk(segment));
            } else {
                timeline.put(interval, new TimelineObjectHolder<>(interval, segment.getInterval(), segment.getVersion(), new PartitionHolder<>(segment.getShardSpec().createChunk(segment))));
            }
        }
    }
    // Validate that none of the given windows overlaps (except for when multiple segments share exactly the
    // same interval).
    Interval lastInterval = null;
    for (Interval interval : timeline.keySet()) {
        if (lastInterval != null && interval.overlaps(lastInterval)) {
            throw new IAE("Distinct intervals in input segments may not overlap: [%s] vs [%s]", lastInterval, interval);
        }
        lastInterval = interval;
    }
    return new ArrayList<>(timeline.values());
}
Also used : PartitionHolder(org.apache.druid.timeline.partition.PartitionHolder) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) WindowedSegmentId(org.apache.druid.indexing.firehose.WindowedSegmentId) IAE(org.apache.druid.java.util.common.IAE) DataSegment(org.apache.druid.timeline.DataSegment) TimelineObjectHolder(org.apache.druid.timeline.TimelineObjectHolder) ISE(org.apache.druid.java.util.common.ISE) Interval(org.joda.time.Interval)
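
The validation pass at the end relies on the TreeMap keys being ordered by interval start, which reduces overlap detection to comparing each interval with its predecessor. A standalone sketch of that check, with hypothetical interval values and using only Joda-Time:

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.TreeSet;
import org.joda.time.Interval;

public class OverlapCheckSketch {
    // Reject interval sets in which two distinct intervals overlap, mirroring the
    // validation at the end of DruidInputSource#getTimelineForSegmentIds.
    static void validateNoOverlaps(List<Interval> intervals) {
        // Order by start then end; exact duplicates collapse into one entry, just as
        // identical intervals share a single key in the method's TreeMap.
        TreeSet<Interval> sorted = new TreeSet<>(
            Comparator.comparingLong(Interval::getStartMillis).thenComparingLong(Interval::getEndMillis));
        sorted.addAll(intervals);
        Interval last = null;
        for (Interval interval : sorted) {
            if (last != null && interval.overlaps(last)) {
                throw new IllegalArgumentException(
                    "Distinct intervals in input segments may not overlap: [" + last + "] vs [" + interval + "]");
            }
            last = interval;
        }
    }

    public static void main(String[] args) {
        // OK: identical intervals are allowed, disjoint intervals are allowed.
        validateNoOverlaps(Arrays.asList(
            Interval.parse("2020-01-01/2020-01-02"),
            Interval.parse("2020-01-01/2020-01-02"),
            Interval.parse("2020-01-03/2020-01-04")));
        // Throws: partially overlapping intervals.
        validateNoOverlaps(Arrays.asList(
            Interval.parse("2020-01-01/2020-01-05"),
            Interval.parse("2020-01-03/2020-01-07")));
    }
}

The real method throws Druid's IAE rather than a plain IllegalArgumentException, but the check itself is the same single pass over start-ordered intervals.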

Example 8 with TimelineObjectHolder

use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.

the class NumberedShardSpecTest method testVersionedIntervalTimelineBehaviorForNumberedShardSpec.

private void testVersionedIntervalTimelineBehaviorForNumberedShardSpec(List<PartitionChunk<OvershadowableString>> chunks, Set<OvershadowableString> expectedObjects) {
    VersionedIntervalTimeline<String, OvershadowableString> timeline = new VersionedIntervalTimeline<>(Ordering.natural());
    Interval interval = Intervals.of("2000/3000");
    String version = "v1";
    for (PartitionChunk<OvershadowableString> chunk : chunks) {
        timeline.add(interval, version, chunk);
    }
    Set<OvershadowableString> actualObjects = new HashSet<>();
    List<TimelineObjectHolder<String, OvershadowableString>> entries = timeline.lookup(interval);
    for (TimelineObjectHolder<String, OvershadowableString> entry : entries) {
        for (PartitionChunk<OvershadowableString> chunk : entry.getObject()) {
            actualObjects.add(chunk.getObject());
        }
    }
    Assert.assertEquals(expectedObjects, actualObjects);
}
Also used : TimelineObjectHolder(org.apache.druid.timeline.TimelineObjectHolder) VersionedIntervalTimeline(org.apache.druid.timeline.VersionedIntervalTimeline) Interval(org.joda.time.Interval) HashSet(java.util.HashSet)
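
The two inner loops, which flatten lookup results into the set of payload objects, recur in several of these examples. A generic helper version (the class and method names here are hypothetical, not part of Druid) could look like this:

import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.druid.timeline.TimelineObjectHolder;
import org.apache.druid.timeline.partition.PartitionChunk;

public final class TimelineFlatten {
    // Collect every object carried by the chunks of a timeline lookup result.
    public static <V, T> Set<T> objectsOf(List<TimelineObjectHolder<V, T>> holders) {
        Set<T> objects = new HashSet<>();
        for (TimelineObjectHolder<V, T> holder : holders) {
            for (PartitionChunk<T> chunk : holder.getObject()) {
                objects.add(chunk.getObject());
            }
        }
        return objects;
    }
}

With such a helper, the assertion above would reduce to Assert.assertEquals(expectedObjects, objectsOf(timeline.lookup(interval))).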

Example 9 with TimelineObjectHolder

use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.

the class DataSourceOptimizer method optimize.

/**
 * Does the main work of materialized view selection: transforms the user query into one or more sub-queries.
 *
 * In each sub-query, the dataSource is a derivative of the dataSource in the user query, and the union of all
 * sub-queries' intervals equals the interval of the user query.
 *
 * The derived dataSource with the smallest average data size per segment granularity has the highest priority
 * to replace the dataSource in the user query.
 *
 * @param query only TopNQuery/TimeseriesQuery/GroupByQuery can be optimized
 * @return a list of queries with the selected derived dataSources and intervals
 */
public List<Query> optimize(Query query) {
    long start = System.currentTimeMillis();
    // only TableDataSource can be optimized
    if (!(query instanceof TopNQuery || query instanceof TimeseriesQuery || query instanceof GroupByQuery) || !(query.getDataSource() instanceof TableDataSource)) {
        return Collections.singletonList(query);
    }
    String datasourceName = ((TableDataSource) query.getDataSource()).getName();
    // Get all derivatives for the datasource in the query. The derivative set is sorted by the average
    // data size per segment granularity.
    Set<DerivativeDataSource> derivatives = DerivativeDataSourceManager.getDerivatives(datasourceName);
    if (derivatives.isEmpty()) {
        return Collections.singletonList(query);
    }
    lock.readLock().lock();
    try {
        totalCount.computeIfAbsent(datasourceName, dsName -> new AtomicLong(0)).incrementAndGet();
        hitCount.putIfAbsent(datasourceName, new AtomicLong(0));
        AtomicLong costTimeOfDataSource = costTime.computeIfAbsent(datasourceName, dsName -> new AtomicLong(0));
        // get all fields that the query requires
        Set<String> requiredFields = MaterializedViewUtils.getRequiredFields(query);
        Set<DerivativeDataSource> derivativesWithRequiredFields = new HashSet<>();
        for (DerivativeDataSource derivativeDataSource : derivatives) {
            derivativesHitCount.putIfAbsent(derivativeDataSource.getName(), new AtomicLong(0));
            if (derivativeDataSource.getColumns().containsAll(requiredFields)) {
                derivativesWithRequiredFields.add(derivativeDataSource);
            }
        }
        // If no derivative contains all required fields, materialized view selection fails for this query.
        if (derivativesWithRequiredFields.isEmpty()) {
            missFields.computeIfAbsent(datasourceName, dsName -> new ConcurrentHashMap<>()).computeIfAbsent(requiredFields, rf -> new AtomicLong(0)).incrementAndGet();
            costTimeOfDataSource.addAndGet(System.currentTimeMillis() - start);
            return Collections.singletonList(query);
        }
        List<Query> queries = new ArrayList<>();
        List<Interval> remainingQueryIntervals = (List<Interval>) query.getIntervals();
        for (DerivativeDataSource derivativeDataSource : ImmutableSortedSet.copyOf(derivativesWithRequiredFields)) {
            final List<Interval> derivativeIntervals = remainingQueryIntervals.stream()
                .flatMap(interval -> serverView
                    .getTimeline(DataSourceAnalysis.forDataSource(new TableDataSource(derivativeDataSource.getName())))
                    .orElseThrow(() -> new ISE("No timeline for dataSource: %s", derivativeDataSource.getName()))
                    .lookup(interval)
                    .stream()
                    .map(TimelineObjectHolder::getInterval))
                .collect(Collectors.toList());
            // A derivative with no data in the remaining query intervals will not be selected.
            if (derivativeIntervals.isEmpty()) {
                continue;
            }
            remainingQueryIntervals = MaterializedViewUtils.minus(remainingQueryIntervals, derivativeIntervals);
            queries.add(query.withDataSource(new TableDataSource(derivativeDataSource.getName())).withQuerySegmentSpec(new MultipleIntervalSegmentSpec(derivativeIntervals)));
            derivativesHitCount.get(derivativeDataSource.getName()).incrementAndGet();
            if (remainingQueryIntervals.isEmpty()) {
                break;
            }
        }
        if (queries.isEmpty()) {
            costTime.get(datasourceName).addAndGet(System.currentTimeMillis() - start);
            return Collections.singletonList(query);
        }
        // Query the intervals not covered by any derivative against the original datasource.
        if (!remainingQueryIntervals.isEmpty()) {
            queries.add(query.withQuerySegmentSpec(new MultipleIntervalSegmentSpec(remainingQueryIntervals)));
        }
        hitCount.get(datasourceName).incrementAndGet();
        costTime.get(datasourceName).addAndGet(System.currentTimeMillis() - start);
        return queries;
    } finally {
        lock.readLock().unlock();
    }
}
Also used : DataSourceAnalysis(org.apache.druid.query.planning.DataSourceAnalysis) Inject(com.google.inject.Inject) HashMap(java.util.HashMap) ReentrantReadWriteLock(java.util.concurrent.locks.ReentrantReadWriteLock) TopNQuery(org.apache.druid.query.topn.TopNQuery) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Interval(org.joda.time.Interval) Query(org.apache.druid.query.Query) Map(java.util.Map) MultipleIntervalSegmentSpec(org.apache.druid.query.spec.MultipleIntervalSegmentSpec) GroupByQuery(org.apache.druid.query.groupby.GroupByQuery) TimelineServerView(org.apache.druid.client.TimelineServerView) ReadWriteLock(java.util.concurrent.locks.ReadWriteLock) ImmutableSortedSet(com.google.common.collect.ImmutableSortedSet) ImmutableMap(com.google.common.collect.ImmutableMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) TimeseriesQuery(org.apache.druid.query.timeseries.TimeseriesQuery) TimelineObjectHolder(org.apache.druid.timeline.TimelineObjectHolder) Set(java.util.Set) ISE(org.apache.druid.java.util.common.ISE) Collectors(java.util.stream.Collectors) TableDataSource(org.apache.druid.query.TableDataSource) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) Collections(java.util.Collections)
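
The interval arithmetic behind remainingQueryIntervals is the heart of the loop: each derivative claims the intervals it can serve, and the remainder goes to the next candidate or to the original datasource. Here is a simplified, Joda-only sketch of interval subtraction in the spirit of the MaterializedViewUtils.minus call above; it is not the actual implementation, and class and variable names are illustrative.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.joda.time.Interval;

public class IntervalMinusSketch {
    // Remove the parts of `remaining` covered by `covered` (simplified: covered intervals
    // are applied one at a time, so inputs need not be sorted or condensed beforehand).
    static List<Interval> minus(List<Interval> remaining, List<Interval> covered) {
        List<Interval> result = new ArrayList<>(remaining);
        for (Interval c : covered) {
            List<Interval> next = new ArrayList<>();
            for (Interval r : result) {
                if (!r.overlaps(c)) {
                    next.add(r); // untouched
                    continue;
                }
                if (r.getStartMillis() < c.getStartMillis()) {
                    next.add(new Interval(r.getStartMillis(), c.getStartMillis())); // left remainder
                }
                if (c.getEndMillis() < r.getEndMillis()) {
                    next.add(new Interval(c.getEndMillis(), r.getEndMillis())); // right remainder
                }
            }
            result = next;
        }
        return result;
    }

    public static void main(String[] args) {
        List<Interval> query = Arrays.asList(Interval.parse("2020-01-01/2020-02-01"));
        List<Interval> servedByDerivative = Arrays.asList(Interval.parse("2020-01-10/2020-01-20"));
        // Leaves 2020-01-01/2020-01-10 and 2020-01-20/2020-02-01 for the next candidate
        // derivative, or for the original datasource.
        System.out.println(minus(query, servedByDerivative));
    }
}

The real optimizer pairs this subtraction with the timeline lookup shown above, so each derivative only claims intervals for which it actually has segments.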

Example 10 with TimelineObjectHolder

use of org.apache.druid.timeline.TimelineObjectHolder in project hive by apache.

the class DruidStorageHandlerUtils method publishSegmentsAndCommit.

/**
 * First computes the segment timeline to accommodate the new segments (for the INSERT INTO case),
 * then moves the segments to Druid deep storage with updated metadata/versions.
 * ALL IS DONE IN ONE TRANSACTION
 *
 * @param connector                   DBI connector to commit
 * @param metadataStorageTablesConfig Druid metadata tables definitions
 * @param dataSource                  Druid datasource name
 * @param segments                    List of segments to move and commit to metadata
 * @param overwrite                   if it is an insert overwrite
 * @param conf                        Configuration
 * @param dataSegmentPusher           segment pusher
 * @return List of successfully published Druid segments. The list carries the versions and metadata
 * updated after the move and timeline-based allocation.
 * @throws CallbackFailedException in case the connector cannot add the segments to the DB.
 */
@SuppressWarnings("unchecked")
static List<DataSegment> publishSegmentsAndCommit(
        final SQLMetadataConnector connector,
        final MetadataStorageTablesConfig metadataStorageTablesConfig,
        final String dataSource,
        final List<DataSegment> segments,
        boolean overwrite,
        Configuration conf,
        DataSegmentPusher dataSegmentPusher
) throws CallbackFailedException {
    return connector.getDBI().inTransaction((handle, transactionStatus) -> {
        // We create the timeline for the existing and new segments
        VersionedIntervalTimeline<String, DataSegment> timeline;
        if (overwrite) {
            // If we are overwriting, disable the datasource's existing segments first
            disableDataSourceWithHandle(handle, metadataStorageTablesConfig, dataSource);
            // When overwriting, we just start with empty timeline,
            // as we are overwriting segments with new versions
            timeline = new VersionedIntervalTimeline<>(Ordering.natural());
        } else {
            // Append Mode
            if (segments.isEmpty()) {
                // If there are no new segments, we can just bail out
                return Collections.EMPTY_LIST;
            }
            // Otherwise, build a timeline of existing segments in metadata storage
            Interval indexedInterval = JodaUtils.umbrellaInterval(segments.stream().map(DataSegment::getInterval).collect(Collectors.toList()));
            LOG.info("Building timeline for umbrella Interval [{}]", indexedInterval);
            timeline = getTimelineForIntervalWithHandle(handle, dataSource, indexedInterval, metadataStorageTablesConfig);
        }
        final List<DataSegment> finalSegmentsToPublish = Lists.newArrayList();
        for (DataSegment segment : segments) {
            List<TimelineObjectHolder<String, DataSegment>> existingChunks = timeline.lookup(segment.getInterval());
            if (existingChunks.size() > 1) {
                // Druid shard specs do not support multiple partition sets for the same interval with different granularities.
                throw new IllegalStateException(String.format("Cannot allocate new segment for dataSource[%s], interval[%s], already have [%,d] chunks. " + "Not possible to append new segment.", dataSource, segment.getInterval(), existingChunks.size()));
            }
            // Find the segment with the latest version and maximum partition number
            SegmentIdWithShardSpec max = null;
            final ShardSpec newShardSpec;
            final String newVersion;
            if (!existingChunks.isEmpty()) {
                // Some chunks already exist; find the max among them
                TimelineObjectHolder<String, DataSegment> existingHolder = Iterables.getOnlyElement(existingChunks);
                for (PartitionChunk<DataSegment> existing : existingHolder.getObject()) {
                    if (max == null || max.getShardSpec().getPartitionNum() < existing.getObject().getShardSpec().getPartitionNum()) {
                        max = SegmentIdWithShardSpec.fromDataSegment(existing.getObject());
                    }
                }
            }
            if (max == null) {
                // No existing shard present in the database, use the current version.
                newShardSpec = segment.getShardSpec();
                newVersion = segment.getVersion();
            } else {
                // use version of existing max segment to generate new shard spec
                newShardSpec = getNextPartitionShardSpec(max.getShardSpec());
                newVersion = max.getVersion();
            }
            DataSegment publishedSegment = publishSegmentWithShardSpec(segment, newShardSpec, newVersion, getPath(segment).getFileSystem(conf), dataSegmentPusher);
            finalSegmentsToPublish.add(publishedSegment);
            timeline.add(publishedSegment.getInterval(), publishedSegment.getVersion(), publishedSegment.getShardSpec().createChunk(publishedSegment));
        }
        // Publish new segments to metadata storage
        final PreparedBatch batch = handle.prepareBatch(String.format("INSERT INTO %1$s (id, dataSource, created_date, start, \"end\", partitioned, version, used, payload) " + "VALUES (:id, :dataSource, :created_date, :start, :end, :partitioned, :version, :used, :payload)", metadataStorageTablesConfig.getSegmentsTable()));
        for (final DataSegment segment : finalSegmentsToPublish) {
            batch.add(new ImmutableMap.Builder<String, Object>().put("id", segment.getId().toString()).put("dataSource", segment.getDataSource()).put("created_date", new DateTime().toString()).put("start", segment.getInterval().getStart().toString()).put("end", segment.getInterval().getEnd().toString()).put("partitioned", !(segment.getShardSpec() instanceof NoneShardSpec)).put("version", segment.getVersion()).put("used", true).put("payload", JSON_MAPPER.writeValueAsBytes(segment)).build());
            LOG.info("Published {}", segment.getId().toString());
        }
        batch.execute();
        return finalSegmentsToPublish;
    });
}
Also used : NoneShardSpec(org.apache.druid.timeline.partition.NoneShardSpec) GenericUDFToString(org.apache.hadoop.hive.ql.udf.generic.GenericUDFToString) DataSegment(org.apache.druid.timeline.DataSegment) SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) LinearShardSpec(org.apache.druid.timeline.partition.LinearShardSpec) NoneShardSpec(org.apache.druid.timeline.partition.NoneShardSpec) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) SegmentIdWithShardSpec(org.apache.druid.segment.realtime.appenderator.SegmentIdWithShardSpec) ImmutableMap(com.google.common.collect.ImmutableMap) DateTime(org.joda.time.DateTime) TimelineObjectHolder(org.apache.druid.timeline.TimelineObjectHolder) PreparedBatch(org.skife.jdbi.v2.PreparedBatch) Interval(org.joda.time.Interval)
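
The append path hinges on finding the highest existing partition number within the interval's single chunk set and bumping it. Below is a minimal sketch of that allocation step, assuming numbered shard specs only; the real getNextPartitionShardSpec is not shown in the snippet and may handle other shard spec types (the imports suggest LinearShardSpec as well), and the class and method names here are hypothetical.

import java.util.List;
import org.apache.druid.timeline.partition.NumberedShardSpec;
import org.apache.druid.timeline.partition.ShardSpec;

public final class NextPartitionSketch {
    // Given the shard specs already present in the interval's existing chunk set,
    // allocate a spec for the next appended segment: max partition number plus one.
    static ShardSpec nextNumberedShardSpec(List<ShardSpec> existing) {
        int maxPartitionNum = -1;
        for (ShardSpec spec : existing) {
            maxPartitionNum = Math.max(maxPartitionNum, spec.getPartitionNum());
        }
        // `partitions` is set to 0 here for simplicity; the value used by the real
        // helper is not visible in the snippet above.
        return new NumberedShardSpec(maxPartitionNum + 1, 0);
    }
}

Note how the snippet then reuses the existing max segment's version (newVersion = max.getVersion()), so the appended chunk joins the existing partition set in the timeline instead of overshadowing it.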

Aggregations

TimelineObjectHolder (org.apache.druid.timeline.TimelineObjectHolder)28 DataSegment (org.apache.druid.timeline.DataSegment)23 Interval (org.joda.time.Interval)18 ArrayList (java.util.ArrayList)14 Test (org.junit.Test)12 List (java.util.List)11 TableDataSource (org.apache.druid.query.TableDataSource)10 Map (java.util.Map)9 IOException (java.io.IOException)8 HashSet (java.util.HashSet)8 CountDownLatch (java.util.concurrent.CountDownLatch)8 VersionedIntervalTimeline (org.apache.druid.timeline.VersionedIntervalTimeline)8 ImmutableList (com.google.common.collect.ImmutableList)7 ISE (org.apache.druid.java.util.common.ISE)7 PartitionChunk (org.apache.druid.timeline.partition.PartitionChunk)7 Preconditions (com.google.common.base.Preconditions)6 Collections (java.util.Collections)6 HashMap (java.util.HashMap)6 Logger (org.apache.druid.java.util.common.logger.Logger)6 File (java.io.File)5