Search in sources :

Example 1 with WindowedSegmentId

use of org.apache.druid.indexing.firehose.WindowedSegmentId in project druid by druid-io.

the class DruidInputSource method getTimelineForSegmentIds.

public static List<TimelineObjectHolder<String, DataSegment>> getTimelineForSegmentIds(CoordinatorClient coordinatorClient, String dataSource, List<WindowedSegmentId> segmentIds) {
    final SortedMap<Interval, TimelineObjectHolder<String, DataSegment>> timeline = new TreeMap<>(Comparators.intervalsByStartThenEnd());
    for (WindowedSegmentId windowedSegmentId : Preconditions.checkNotNull(segmentIds, "segmentIds")) {
        final DataSegment segment = coordinatorClient.fetchUsedSegment(dataSource, windowedSegmentId.getSegmentId());
        for (Interval interval : windowedSegmentId.getIntervals()) {
            final TimelineObjectHolder<String, DataSegment> existingHolder = timeline.get(interval);
            if (existingHolder != null) {
                if (!existingHolder.getVersion().equals(segment.getVersion())) {
                    throw new ISE("Timeline segments with the same interval should have the same version: " + "existing version[%s] vs new segment[%s]", existingHolder.getVersion(), segment);
                }
                existingHolder.getObject().add(segment.getShardSpec().createChunk(segment));
            } else {
                timeline.put(interval, new TimelineObjectHolder<>(interval, segment.getInterval(), segment.getVersion(), new PartitionHolder<>(segment.getShardSpec().createChunk(segment))));
            }
        }
    }
    // Validate that none of the given windows overlaps (except for when multiple segments share exactly the
    // same interval).
    Interval lastInterval = null;
    for (Interval interval : timeline.keySet()) {
        if (lastInterval != null && interval.overlaps(lastInterval)) {
            throw new IAE("Distinct intervals in input segments may not overlap: [%s] vs [%s]", lastInterval, interval);
        }
        lastInterval = interval;
    }
    return new ArrayList<>(timeline.values());
}
Also used : PartitionHolder(org.apache.druid.timeline.partition.PartitionHolder) ArrayList(java.util.ArrayList) TreeMap(java.util.TreeMap) WindowedSegmentId(org.apache.druid.indexing.firehose.WindowedSegmentId) IAE(org.apache.druid.java.util.common.IAE) DataSegment(org.apache.druid.timeline.DataSegment) TimelineObjectHolder(org.apache.druid.timeline.TimelineObjectHolder) ISE(org.apache.druid.java.util.common.ISE) Interval(org.joda.time.Interval)

Example 2 with WindowedSegmentId

use of org.apache.druid.indexing.firehose.WindowedSegmentId in project druid by druid-io.

the class DruidInputSource method createSplits.

public static Iterator<InputSplit<List<WindowedSegmentId>>> createSplits(CoordinatorClient coordinatorClient, RetryPolicyFactory retryPolicyFactory, String dataSource, Interval interval, SplitHintSpec splitHintSpec) {
    final SplitHintSpec convertedSplitHintSpec;
    if (splitHintSpec instanceof SegmentsSplitHintSpec) {
        final SegmentsSplitHintSpec segmentsSplitHintSpec = (SegmentsSplitHintSpec) splitHintSpec;
        convertedSplitHintSpec = new MaxSizeSplitHintSpec(segmentsSplitHintSpec.getMaxInputSegmentBytesPerTask(), segmentsSplitHintSpec.getMaxNumSegments());
    } else {
        convertedSplitHintSpec = splitHintSpec;
    }
    final List<TimelineObjectHolder<String, DataSegment>> timelineSegments = getTimelineForInterval(coordinatorClient, retryPolicyFactory, dataSource, interval);
    final Map<WindowedSegmentId, Long> segmentIdToSize = createWindowedSegmentIdFromTimeline(timelineSegments);
    // noinspection ConstantConditions
    return Iterators.transform(convertedSplitHintSpec.split(// the same input split.
    segmentIdToSize.keySet().iterator(), segmentId -> new InputFileAttribute(Preconditions.checkNotNull(segmentIdToSize.get(segmentId), "segment size for [%s]", segmentId))), InputSplit::new);
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) SegmentCacheManagerFactory(org.apache.druid.indexing.common.SegmentCacheManagerFactory) TaskConfig(org.apache.druid.indexing.common.config.TaskConfig) Comparators(org.apache.druid.java.util.common.guava.Comparators) AbstractInputSource(org.apache.druid.data.input.AbstractInputSource) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) FluentIterable(com.google.common.collect.FluentIterable) Map(java.util.Map) InputSourceReader(org.apache.druid.data.input.InputSourceReader) IAE(org.apache.druid.java.util.common.IAE) JacksonInject(com.fasterxml.jackson.annotation.JacksonInject) RetryPolicyFactory(org.apache.druid.indexing.common.RetryPolicyFactory) InputFormat(org.apache.druid.data.input.InputFormat) Collection(java.util.Collection) SplitHintSpec(org.apache.druid.data.input.SplitHintSpec) SplittableInputSource(org.apache.druid.data.input.impl.SplittableInputSource) ISE(org.apache.druid.java.util.common.ISE) Objects(java.util.Objects) MaxSizeSplitHintSpec(org.apache.druid.data.input.MaxSizeSplitHintSpec) PartitionHolder(org.apache.druid.timeline.partition.PartitionHolder) List(java.util.List) Stream(java.util.stream.Stream) DimFilter(org.apache.druid.query.filter.DimFilter) DataSegment(org.apache.druid.timeline.DataSegment) SortedMap(java.util.SortedMap) Logger(org.apache.druid.java.util.common.logger.Logger) Streams(org.apache.druid.utils.Streams) InputSplit(org.apache.druid.data.input.InputSplit) Duration(org.joda.time.Duration) SegmentsSplitHintSpec(org.apache.druid.data.input.SegmentsSplitHintSpec) HashMap(java.util.HashMap) InputRowSchema(org.apache.druid.data.input.InputRowSchema) Iterators(com.google.common.collect.Iterators) ArrayList(java.util.ArrayList) PartitionChunk(org.apache.druid.timeline.partition.PartitionChunk) Interval(org.joda.time.Interval) ColumnHolder(org.apache.druid.segment.column.ColumnHolder) ImmutableList(com.google.common.collect.ImmutableList) WindowedSegmentId(org.apache.druid.indexing.firehose.WindowedSegmentId) CoordinatorClient(org.apache.druid.client.coordinator.CoordinatorClient) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) InputFileAttribute(org.apache.druid.data.input.InputFileAttribute) Nullable(javax.annotation.Nullable) RetryPolicy(org.apache.druid.indexing.common.RetryPolicy) VersionedIntervalTimeline(org.apache.druid.timeline.VersionedIntervalTimeline) Iterator(java.util.Iterator) TimelineObjectHolder(org.apache.druid.timeline.TimelineObjectHolder) File(java.io.File) InputEntityIteratingReader(org.apache.druid.data.input.impl.InputEntityIteratingReader) TreeMap(java.util.TreeMap) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) JsonInclude(com.fasterxml.jackson.annotation.JsonInclude) Preconditions(com.google.common.base.Preconditions) SegmentCacheManager(org.apache.druid.segment.loading.SegmentCacheManager) Comparator(java.util.Comparator) IndexIO(org.apache.druid.segment.IndexIO) Collections(java.util.Collections) SegmentsSplitHintSpec(org.apache.druid.data.input.SegmentsSplitHintSpec) TimelineObjectHolder(org.apache.druid.timeline.TimelineObjectHolder) InputFileAttribute(org.apache.druid.data.input.InputFileAttribute) WindowedSegmentId(org.apache.druid.indexing.firehose.WindowedSegmentId) InputSplit(org.apache.druid.data.input.InputSplit) SplitHintSpec(org.apache.druid.data.input.SplitHintSpec) MaxSizeSplitHintSpec(org.apache.druid.data.input.MaxSizeSplitHintSpec) SegmentsSplitHintSpec(org.apache.druid.data.input.SegmentsSplitHintSpec) MaxSizeSplitHintSpec(org.apache.druid.data.input.MaxSizeSplitHintSpec)

Example 3 with WindowedSegmentId

use of org.apache.druid.indexing.firehose.WindowedSegmentId in project druid by druid-io.

the class AbstractBatchIndexTask method findInputSegments.

/**
 * If the given firehoseFactory is {@link IngestSegmentFirehoseFactory}, then it finds the segments to lock
 * from the firehoseFactory. This is because those segments will be read by this task no matter what segments would be
 * filtered by intervalsToRead, so they need to be locked.
 * <p>
 * However, firehoseFactory is not IngestSegmentFirehoseFactory, it means this task will overwrite some segments
 * with data read from some input source outside of Druid. As a result, only the segments falling in intervalsToRead
 * should be locked.
 * <p>
 * The order of segments within the returned list is unspecified, but each segment is guaranteed to appear in the list
 * only once.
 */
protected static List<DataSegment> findInputSegments(String dataSource, TaskActionClient actionClient, List<Interval> intervalsToRead, FirehoseFactory firehoseFactory) throws IOException {
    if (firehoseFactory instanceof IngestSegmentFirehoseFactory) {
        // intervalsToRead is ignored here.
        final List<WindowedSegmentId> inputSegments = ((IngestSegmentFirehoseFactory) firehoseFactory).getSegments();
        if (inputSegments == null) {
            final Interval inputInterval = Preconditions.checkNotNull(((IngestSegmentFirehoseFactory) firehoseFactory).getInterval(), "input interval");
            return ImmutableList.copyOf(actionClient.submit(new RetrieveUsedSegmentsAction(dataSource, inputInterval, null, Segments.ONLY_VISIBLE)));
        } else {
            final List<String> inputSegmentIds = inputSegments.stream().map(WindowedSegmentId::getSegmentId).collect(Collectors.toList());
            final Collection<DataSegment> dataSegmentsInIntervals = actionClient.submit(new RetrieveUsedSegmentsAction(dataSource, null, inputSegments.stream().flatMap(windowedSegmentId -> windowedSegmentId.getIntervals().stream()).collect(Collectors.toSet()), Segments.ONLY_VISIBLE));
            return dataSegmentsInIntervals.stream().filter(segment -> inputSegmentIds.contains(segment.getId().toString())).collect(Collectors.toList());
        }
    } else {
        return ImmutableList.copyOf(actionClient.submit(new RetrieveUsedSegmentsAction(dataSource, null, intervalsToRead, Segments.ONLY_VISIBLE)));
    }
}
Also used : IngestSegmentFirehoseFactory(org.apache.druid.indexing.firehose.IngestSegmentFirehoseFactory) TaskToolbox(org.apache.druid.indexing.common.TaskToolbox) TaskConfig(org.apache.druid.indexing.common.config.TaskConfig) IndexIOConfig(org.apache.druid.indexing.common.task.IndexTask.IndexIOConfig) Partitions(org.apache.druid.timeline.Partitions) CompactionState(org.apache.druid.timeline.CompactionState) JodaUtils(org.apache.druid.java.util.common.JodaUtils) TaskActionClient(org.apache.druid.indexing.common.actions.TaskActionClient) Optional(com.google.common.base.Optional) Map(java.util.Map) InputSourceReader(org.apache.druid.data.input.InputSourceReader) TaskLock(org.apache.druid.indexing.common.TaskLock) TypeReference(com.fasterxml.jackson.core.type.TypeReference) Execs(org.apache.druid.java.util.common.concurrent.Execs) InputRowSchemas(org.apache.druid.indexing.input.InputRowSchemas) Predicate(java.util.function.Predicate) GuardedBy(com.google.errorprone.annotations.concurrent.GuardedBy) InputFormat(org.apache.druid.data.input.InputFormat) Collection(java.util.Collection) Segments(org.apache.druid.indexing.overlord.Segments) StringUtils(org.apache.druid.java.util.common.StringUtils) Set(java.util.Set) ISE(org.apache.druid.java.util.common.ISE) IngestionSpec(org.apache.druid.segment.indexing.IngestionSpec) Collectors(java.util.stream.Collectors) LockGranularity(org.apache.druid.indexing.common.LockGranularity) MaxAllowedLocksExceededException(org.apache.druid.indexing.common.task.batch.MaxAllowedLocksExceededException) IngestSegmentFirehoseFactory(org.apache.druid.indexing.firehose.IngestSegmentFirehoseFactory) InputRow(org.apache.druid.data.input.InputRow) CountDownLatch(java.util.concurrent.CountDownLatch) List(java.util.List) TaskLockType(org.apache.druid.indexing.common.TaskLockType) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) DataSegment(org.apache.druid.timeline.DataSegment) TransformSpec(org.apache.druid.segment.transform.TransformSpec) Logger(org.apache.druid.java.util.common.logger.Logger) Granularity(org.apache.druid.java.util.common.granularity.Granularity) ParseExceptionHandler(org.apache.druid.segment.incremental.ParseExceptionHandler) IntervalsByGranularity(org.apache.druid.java.util.common.granularity.IntervalsByGranularity) HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) TimeChunkLockTryAcquireAction(org.apache.druid.indexing.common.actions.TimeChunkLockTryAcquireAction) RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters) Function(java.util.function.Function) TaskStatus(org.apache.druid.indexer.TaskStatus) TuningConfig(org.apache.druid.segment.indexing.TuningConfig) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) InputSource(org.apache.druid.data.input.InputSource) Interval(org.joda.time.Interval) ImmutableList(com.google.common.collect.ImmutableList) WindowedSegmentId(org.apache.druid.indexing.firehose.WindowedSegmentId) BiConsumer(java.util.function.BiConsumer) SegmentHandoffNotifier(org.apache.druid.segment.handoff.SegmentHandoffNotifier) Nullable(javax.annotation.Nullable) ExecutorService(java.util.concurrent.ExecutorService) ClientCompactionTaskTransformSpec(org.apache.druid.client.indexing.ClientCompactionTaskTransformSpec) Period(org.joda.time.Period) FirehoseFactory(org.apache.druid.data.input.FirehoseFactory) VersionedIntervalTimeline(org.apache.druid.timeline.VersionedIntervalTimeline) Iterator(java.util.Iterator) ServiceMetricEvent(org.apache.druid.java.util.emitter.service.ServiceMetricEvent) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) IOException(java.io.IOException) File(java.io.File) TimeUnit(java.util.concurrent.TimeUnit) Consumer(java.util.function.Consumer) GranularityType(org.apache.druid.java.util.common.granularity.GranularityType) Preconditions(com.google.common.base.Preconditions) RetrieveUsedSegmentsAction(org.apache.druid.indexing.common.actions.RetrieveUsedSegmentsAction) SegmentDescriptor(org.apache.druid.query.SegmentDescriptor) DataSchema(org.apache.druid.segment.indexing.DataSchema) Collections(java.util.Collections) RetrieveUsedSegmentsAction(org.apache.druid.indexing.common.actions.RetrieveUsedSegmentsAction) WindowedSegmentId(org.apache.druid.indexing.firehose.WindowedSegmentId) DataSegment(org.apache.druid.timeline.DataSegment) Interval(org.joda.time.Interval)

Example 4 with WindowedSegmentId

use of org.apache.druid.indexing.firehose.WindowedSegmentId in project druid by druid-io.

the class DruidInputSourceTest method testSerdeUsingSegments.

@Test
public void testSerdeUsingSegments() throws Exception {
    final String json = "{" + "\"type\":\"druid\"," + "\"dataSource\":\"foo\"," + "\"segments\":[" + "{\"segmentId\":\"foo_2000-01-01T00:00:00.000Z_2000-01-01T01:00:00.000Z_abc123\"," + "\"intervals\":[\"2000-01-01T00:00:00.000Z/2000-01-01T12:00:00.000Z\"]}" + "]" + "}";
    final InputSource inputSource = mapper.readValue(json, InputSource.class);
    Assert.assertThat(inputSource, CoreMatchers.instanceOf(DruidInputSource.class));
    Assert.assertEquals(new DruidInputSource("foo", null, ImmutableList.of(new WindowedSegmentId("foo_2000-01-01T00:00:00.000Z_2000-01-01T01:00:00.000Z_abc123", ImmutableList.of(Intervals.of("2000-01-01T00/2000-01-01T12")))), null, null, null, indexIO, coordinatorClient, segmentCacheManagerFactory, retryPolicyFactory, taskConfig), inputSource);
    Assert.assertEquals(json, mapper.writeValueAsString(inputSource));
}
Also used : InputSource(org.apache.druid.data.input.InputSource) WindowedSegmentId(org.apache.druid.indexing.firehose.WindowedSegmentId) Test(org.junit.Test)

Aggregations

WindowedSegmentId (org.apache.druid.indexing.firehose.WindowedSegmentId)4 ArrayList (java.util.ArrayList)3 ISE (org.apache.druid.java.util.common.ISE)3 DataSegment (org.apache.druid.timeline.DataSegment)3 Interval (org.joda.time.Interval)3 Preconditions (com.google.common.base.Preconditions)2 ImmutableList (com.google.common.collect.ImmutableList)2 File (java.io.File)2 Collection (java.util.Collection)2 Collections (java.util.Collections)2 Iterator (java.util.Iterator)2 List (java.util.List)2 Map (java.util.Map)2 TreeMap (java.util.TreeMap)2 Nullable (javax.annotation.Nullable)2 InputFormat (org.apache.druid.data.input.InputFormat)2 InputSource (org.apache.druid.data.input.InputSource)2 InputSourceReader (org.apache.druid.data.input.InputSourceReader)2 TaskConfig (org.apache.druid.indexing.common.config.TaskConfig)2 Logger (org.apache.druid.java.util.common.logger.Logger)2