
Example 1 with SplitHintSpec

Use of org.apache.druid.data.input.SplitHintSpec in project druid by druid-io.

From the class GoogleCloudStorageInputSource, the method getPrefixesSplitStream:

@Override
protected Stream<InputSplit<List<CloudObjectLocation>>> getPrefixesSplitStream(@Nonnull SplitHintSpec splitHintSpec) {
    final Iterator<List<StorageObject>> splitIterator = splitHintSpec.split(storageObjectIterable().iterator(), storageObject -> {
        final BigInteger sizeInBigInteger = storageObject.getSize();
        long sizeInLong;
        if (sizeInBigInteger == null) {
            sizeInLong = Long.MAX_VALUE;
        } else {
            try {
                sizeInLong = sizeInBigInteger.longValueExact();
            } catch (ArithmeticException e) {
                LOG.warn(e, "The object [%s, %s] has a size [%s] out of the range of the long type. " + "The max long value will be used for its size instead.", storageObject.getBucket(), storageObject.getName(), sizeInBigInteger);
                sizeInLong = Long.MAX_VALUE;
            }
        }
        return new InputFileAttribute(sizeInLong);
    });
    return Streams.sequentialStreamFrom(splitIterator)
                  .map(objects -> objects.stream()
                                         .map(this::byteSourceFromStorageObject)
                                         .collect(Collectors.toList()))
                  .map(InputSplit::new);
}
Also used: Logger (org.apache.druid.java.util.common.logger.Logger), Streams (org.apache.druid.utils.Streams), JsonProperty (com.fasterxml.jackson.annotation.JsonProperty), GoogleStorageDruidModule (org.apache.druid.storage.google.GoogleStorageDruidModule), GoogleUtils (org.apache.druid.storage.google.GoogleUtils), InputSplit (org.apache.druid.data.input.InputSplit), CloudObjectInputSource (org.apache.druid.data.input.impl.CloudObjectInputSource), InputFileAttribute (org.apache.druid.data.input.InputFileAttribute), GoogleInputDataConfig (org.apache.druid.storage.google.GoogleInputDataConfig), BigInteger (java.math.BigInteger), URI (java.net.URI), Nonnull (javax.annotation.Nonnull), Nullable (javax.annotation.Nullable), StorageObject (com.google.api.services.storage.model.StorageObject), JacksonInject (com.fasterxml.jackson.annotation.JacksonInject), GoogleStorage (org.apache.druid.storage.google.GoogleStorage), Iterator (java.util.Iterator), SplitHintSpec (org.apache.druid.data.input.SplitHintSpec), SplittableInputSource (org.apache.druid.data.input.impl.SplittableInputSource), Collectors (java.util.stream.Collectors), List (java.util.List), Stream (java.util.stream.Stream), CloudObjectLocation (org.apache.druid.data.input.impl.CloudObjectLocation), JsonCreator (com.fasterxml.jackson.annotation.JsonCreator), InputEntity (org.apache.druid.data.input.InputEntity)
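
For context, the grouping itself is done by SplitHintSpec.split, which takes an iterator of items plus a function that reports an InputFileAttribute (size) for each item, and returns an iterator of grouped lists. Below is a minimal, hypothetical sketch of that call using MaxSizeSplitHintSpec; the object sizes, the 1000-byte limit, and the long-based convenience constructor are illustrative assumptions and are not taken from the example above.

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.druid.data.input.InputFileAttribute;
import org.apache.druid.data.input.MaxSizeSplitHintSpec;
import org.apache.druid.data.input.SplitHintSpec;

public class SplitHintSpecSketch {
    public static void main(String[] args) {
        // Pretend each array is a storage object: {id, size in bytes}. Values are made up.
        List<long[]> objects = Arrays.asList(
            new long[]{1, 400L},
            new long[]{2, 300L},
            new long[]{3, 900L}
        );
        // Group objects so that each split stays at or under roughly 1000 bytes (illustrative limit).
        SplitHintSpec hintSpec = new MaxSizeSplitHintSpec(1000L, null);
        Iterator<List<long[]>> splits = hintSpec.split(
            objects.iterator(),
            o -> new InputFileAttribute(o[1])
        );
        while (splits.hasNext()) {
            System.out.println("split with " + splits.next().size() + " object(s)");
        }
    }
}

With these made-up sizes, the first split holds the 400- and 300-byte objects and the second holds the 900-byte object, which is the same grouping behavior getPrefixesSplitStream relies on for GCS objects.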

Example 2 with SplitHintSpec

Use of org.apache.druid.data.input.SplitHintSpec in project druid by druid-io.

From the class DruidInputSource, the method createSplits:

public static Iterator<InputSplit<List<WindowedSegmentId>>> createSplits(CoordinatorClient coordinatorClient, RetryPolicyFactory retryPolicyFactory, String dataSource, Interval interval, SplitHintSpec splitHintSpec) {
    final SplitHintSpec convertedSplitHintSpec;
    if (splitHintSpec instanceof SegmentsSplitHintSpec) {
        final SegmentsSplitHintSpec segmentsSplitHintSpec = (SegmentsSplitHintSpec) splitHintSpec;
        convertedSplitHintSpec = new MaxSizeSplitHintSpec(segmentsSplitHintSpec.getMaxInputSegmentBytesPerTask(), segmentsSplitHintSpec.getMaxNumSegments());
    } else {
        convertedSplitHintSpec = splitHintSpec;
    }
    final List<TimelineObjectHolder<String, DataSegment>> timelineSegments = getTimelineForInterval(coordinatorClient, retryPolicyFactory, dataSource, interval);
    final Map<WindowedSegmentId, Long> segmentIdToSize = createWindowedSegmentIdFromTimeline(timelineSegments);
    // noinspection ConstantConditions
    return Iterators.transform(
        convertedSplitHintSpec.split(
            // segmentIdToSize is sorted by segment ID, so segments from the same time chunk
            // tend to end up in the same input split.
            segmentIdToSize.keySet().iterator(),
            segmentId -> new InputFileAttribute(
                Preconditions.checkNotNull(segmentIdToSize.get(segmentId), "segment size for [%s]", segmentId))),
        InputSplit::new);
}
Also used: JsonProperty (com.fasterxml.jackson.annotation.JsonProperty), SegmentCacheManagerFactory (org.apache.druid.indexing.common.SegmentCacheManagerFactory), TaskConfig (org.apache.druid.indexing.common.config.TaskConfig), Comparators (org.apache.druid.java.util.common.guava.Comparators), AbstractInputSource (org.apache.druid.data.input.AbstractInputSource), TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec), FluentIterable (com.google.common.collect.FluentIterable), Map (java.util.Map), InputSourceReader (org.apache.druid.data.input.InputSourceReader), IAE (org.apache.druid.java.util.common.IAE), JacksonInject (com.fasterxml.jackson.annotation.JacksonInject), RetryPolicyFactory (org.apache.druid.indexing.common.RetryPolicyFactory), InputFormat (org.apache.druid.data.input.InputFormat), Collection (java.util.Collection), SplitHintSpec (org.apache.druid.data.input.SplitHintSpec), SplittableInputSource (org.apache.druid.data.input.impl.SplittableInputSource), ISE (org.apache.druid.java.util.common.ISE), Objects (java.util.Objects), MaxSizeSplitHintSpec (org.apache.druid.data.input.MaxSizeSplitHintSpec), PartitionHolder (org.apache.druid.timeline.partition.PartitionHolder), List (java.util.List), Stream (java.util.stream.Stream), DimFilter (org.apache.druid.query.filter.DimFilter), DataSegment (org.apache.druid.timeline.DataSegment), SortedMap (java.util.SortedMap), Logger (org.apache.druid.java.util.common.logger.Logger), Streams (org.apache.druid.utils.Streams), InputSplit (org.apache.druid.data.input.InputSplit), Duration (org.joda.time.Duration), SegmentsSplitHintSpec (org.apache.druid.data.input.SegmentsSplitHintSpec), HashMap (java.util.HashMap), InputRowSchema (org.apache.druid.data.input.InputRowSchema), Iterators (com.google.common.collect.Iterators), ArrayList (java.util.ArrayList), PartitionChunk (org.apache.druid.timeline.partition.PartitionChunk), Interval (org.joda.time.Interval), ColumnHolder (org.apache.druid.segment.column.ColumnHolder), ImmutableList (com.google.common.collect.ImmutableList), WindowedSegmentId (org.apache.druid.indexing.firehose.WindowedSegmentId), CoordinatorClient (org.apache.druid.client.coordinator.CoordinatorClient), ThreadLocalRandom (java.util.concurrent.ThreadLocalRandom), InputFileAttribute (org.apache.druid.data.input.InputFileAttribute), Nullable (javax.annotation.Nullable), RetryPolicy (org.apache.druid.indexing.common.RetryPolicy), VersionedIntervalTimeline (org.apache.druid.timeline.VersionedIntervalTimeline), Iterator (java.util.Iterator), TimelineObjectHolder (org.apache.druid.timeline.TimelineObjectHolder), File (java.io.File), InputEntityIteratingReader (org.apache.druid.data.input.impl.InputEntityIteratingReader), TreeMap (java.util.TreeMap), JsonCreator (com.fasterxml.jackson.annotation.JsonCreator), JsonInclude (com.fasterxml.jackson.annotation.JsonInclude), Preconditions (com.google.common.base.Preconditions), SegmentCacheManager (org.apache.druid.segment.loading.SegmentCacheManager), Comparator (java.util.Comparator), IndexIO (org.apache.druid.segment.IndexIO), Collections (java.util.Collections)
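The split-then-wrap pattern at the end of createSplits can be exercised in isolation. The sketch below mirrors that tail: it feeds the keys of a size map through a SplitHintSpec and wraps each resulting group in an InputSplit via Iterators.transform. The segment IDs, sizes, 1000-byte cap, and the long-based MaxSizeSplitHintSpec constructor are illustrative assumptions, not values from the Druid code above.

import com.google.common.base.Preconditions;
import com.google.common.collect.Iterators;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.druid.data.input.InputFileAttribute;
import org.apache.druid.data.input.InputSplit;
import org.apache.druid.data.input.MaxSizeSplitHintSpec;
import org.apache.druid.data.input.SplitHintSpec;

public class DruidInputSourceSplitSketch {
    public static void main(String[] args) {
        // Hypothetical segment IDs and sizes in bytes, standing in for segmentIdToSize above.
        Map<String, Long> segmentIdToSize = new LinkedHashMap<>();
        segmentIdToSize.put("wiki_2024-01-01", 700L);
        segmentIdToSize.put("wiki_2024-01-02", 200L);
        segmentIdToSize.put("wiki_2024-01-03", 600L);
        // Cap each split at roughly 1000 bytes of input (illustrative), as the converted hint spec would.
        SplitHintSpec hintSpec = new MaxSizeSplitHintSpec(1000L, null);
        Iterator<InputSplit<List<String>>> splits = Iterators.transform(
            hintSpec.split(
                segmentIdToSize.keySet().iterator(),
                id -> new InputFileAttribute(
                    Preconditions.checkNotNull(segmentIdToSize.get(id), "segment size for [%s]", id))),
            InputSplit::new);
        // Each InputSplit wraps one group of segment IDs whose combined size fits the cap.
        splits.forEachRemaining(split -> System.out.println(split.get()));
    }
}

The conversion earlier in createSplits exists because SegmentsSplitHintSpec only carries sizing hints; rewriting it as a MaxSizeSplitHintSpec lets the same generic split(...) call handle both hint types.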

Aggregations

JacksonInject (com.fasterxml.jackson.annotation.JacksonInject): 2
JsonCreator (com.fasterxml.jackson.annotation.JsonCreator): 2
JsonProperty (com.fasterxml.jackson.annotation.JsonProperty): 2
Iterator (java.util.Iterator): 2
List (java.util.List): 2
Stream (java.util.stream.Stream): 2
Nullable (javax.annotation.Nullable): 2
InputFileAttribute (org.apache.druid.data.input.InputFileAttribute): 2
InputSplit (org.apache.druid.data.input.InputSplit): 2
SplitHintSpec (org.apache.druid.data.input.SplitHintSpec): 2
SplittableInputSource (org.apache.druid.data.input.impl.SplittableInputSource): 2
Logger (org.apache.druid.java.util.common.logger.Logger): 2
Streams (org.apache.druid.utils.Streams): 2
JsonInclude (com.fasterxml.jackson.annotation.JsonInclude): 1
StorageObject (com.google.api.services.storage.model.StorageObject): 1
Preconditions (com.google.common.base.Preconditions): 1
FluentIterable (com.google.common.collect.FluentIterable): 1
ImmutableList (com.google.common.collect.ImmutableList): 1
Iterators (com.google.common.collect.Iterators): 1
File (java.io.File): 1