Example 16 with InputSplit

Use of org.apache.druid.data.input.InputSplit in project druid by druid-io.

From class GoogleCloudStorageInputSource, method getPrefixesSplitStream:

@Override
protected Stream<InputSplit<List<CloudObjectLocation>>> getPrefixesSplitStream(@Nonnull SplitHintSpec splitHintSpec) {
    final Iterator<List<StorageObject>> splitIterator = splitHintSpec.split(
        storageObjectIterable().iterator(),
        storageObject -> {
            final BigInteger sizeInBigInteger = storageObject.getSize();
            long sizeInLong;
            if (sizeInBigInteger == null) {
                sizeInLong = Long.MAX_VALUE;
            } else {
                try {
                    sizeInLong = sizeInBigInteger.longValueExact();
                } catch (ArithmeticException e) {
                    LOG.warn(
                        e,
                        "The object [%s, %s] has a size [%s] out of the range of the long type. "
                        + "The max long value will be used for its size instead.",
                        storageObject.getBucket(),
                        storageObject.getName(),
                        sizeInBigInteger
                    );
                    sizeInLong = Long.MAX_VALUE;
                }
            }
            return new InputFileAttribute(sizeInLong);
        }
    );
    return Streams.sequentialStreamFrom(splitIterator)
                  .map(objects -> objects.stream()
                                         .map(this::byteSourceFromStorageObject)
                                         .collect(Collectors.toList()))
                  .map(InputSplit::new);
}
Also used : Logger(org.apache.druid.java.util.common.logger.Logger) Streams(org.apache.druid.utils.Streams) JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) GoogleStorageDruidModule(org.apache.druid.storage.google.GoogleStorageDruidModule) GoogleUtils(org.apache.druid.storage.google.GoogleUtils) InputSplit(org.apache.druid.data.input.InputSplit) CloudObjectInputSource(org.apache.druid.data.input.impl.CloudObjectInputSource) InputFileAttribute(org.apache.druid.data.input.InputFileAttribute) GoogleInputDataConfig(org.apache.druid.storage.google.GoogleInputDataConfig) BigInteger(java.math.BigInteger) URI(java.net.URI) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) StorageObject(com.google.api.services.storage.model.StorageObject) JacksonInject(com.fasterxml.jackson.annotation.JacksonInject) GoogleStorage(org.apache.druid.storage.google.GoogleStorage) Iterator(java.util.Iterator) SplitHintSpec(org.apache.druid.data.input.SplitHintSpec) SplittableInputSource(org.apache.druid.data.input.impl.SplittableInputSource) Collectors(java.util.stream.Collectors) List(java.util.List) Stream(java.util.stream.Stream) CloudObjectLocation(org.apache.druid.data.input.impl.CloudObjectLocation) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) InputEntity(org.apache.druid.data.input.InputEntity)
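
GCS reports object sizes as BigInteger, so the method clamps anything missing or outside the long range to Long.MAX_VALUE. Below is a minimal, JDK-only sketch of that clamping pattern; the class and helper names here are ours, for illustration, not Druid's:

import java.math.BigInteger;

public class SizeClampDemo {
    // Mirror of the pattern in getPrefixesSplitStream: fall back to Long.MAX_VALUE
    // when the reported object size is missing or exceeds the long range.
    static long toLongSize(BigInteger size) {
        if (size == null) {
            return Long.MAX_VALUE;
        }
        try {
            return size.longValueExact();
        } catch (ArithmeticException e) {
            return Long.MAX_VALUE;
        }
    }

    public static void main(String[] args) {
        System.out.println(toLongSize(BigInteger.valueOf(1024)));      // 1024
        System.out.println(toLongSize(null));                          // 9223372036854775807
        System.out.println(toLongSize(BigInteger.valueOf(2).pow(80))); // 9223372036854775807 (2^80 overflows long)
    }
}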

Example 17 with InputSplit

Use of org.apache.druid.data.input.InputSplit in project druid by druid-io.

From class S3InputSourceTest, method testCreateSplitsWithEmptyObjectsIteratingOnlyNonEmptyObjects:

@Test
public void testCreateSplitsWithEmptyObjectsIteratingOnlyNonEmptyObjects() {
    EasyMock.reset(S3_CLIENT);
    expectListObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)), CONTENT);
    expectListObjects(PREFIXES.get(1), ImmutableList.of(EXPECTED_URIS.get(1)), new byte[0]);
    EasyMock.replay(S3_CLIENT);
    S3InputSource inputSource = new S3InputSource(SERVICE, SERVER_SIDE_ENCRYPTING_AMAZON_S3_BUILDER, INPUT_DATA_CONFIG, null, PREFIXES, null, null);
    Stream<InputSplit<List<CloudObjectLocation>>> splits = inputSource.createSplits(new JsonInputFormat(JSONPathSpec.DEFAULT, null, null), null);
    Assert.assertEquals(ImmutableList.of(ImmutableList.of(new CloudObjectLocation(EXPECTED_URIS.get(0)))), splits.map(InputSplit::get).collect(Collectors.toList()));
    EasyMock.verify(S3_CLIENT);
}
Also used : JsonInputFormat(org.apache.druid.data.input.impl.JsonInputFormat) CloudObjectLocation(org.apache.druid.data.input.impl.CloudObjectLocation) InputSplit(org.apache.druid.data.input.InputSplit) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)
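
For reference, InputSplit is a thin typed wrapper: a one-argument constructor and a get() accessor. Here is a small sketch of building and unwrapping a split stream by hand, the way the assertion above does, assuming the Druid input-source classes are on the classpath (the s3:// URI is a made-up placeholder):

import java.net.URI;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.druid.data.input.InputSplit;
import org.apache.druid.data.input.impl.CloudObjectLocation;

public class SplitUnwrapDemo {
    public static void main(String[] args) {
        // Hand-built stand-in for inputSource.createSplits(...): one split holding one location.
        Stream<InputSplit<List<CloudObjectLocation>>> splits = Stream.of(
            new InputSplit<>(Collections.singletonList(
                new CloudObjectLocation(URI.create("s3://bucket/dir/file1.json"))))
        );
        // InputSplit::get unwraps the payload, exactly as the assertion in the test does.
        List<List<CloudObjectLocation>> locations =
            splits.map(InputSplit::get).collect(Collectors.toList());
        System.out.println(locations);
    }
}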

Example 18 with InputSplit

Use of org.apache.druid.data.input.InputSplit in project druid by druid-io.

From class S3InputSourceTest, method testWithUrisSplit:

@Test
public void testWithUrisSplit() {
    S3InputSource inputSource = new S3InputSource(SERVICE, SERVER_SIDE_ENCRYPTING_AMAZON_S3_BUILDER, INPUT_DATA_CONFIG, EXPECTED_URIS, null, null, null);
    Stream<InputSplit<List<CloudObjectLocation>>> splits = inputSource.createSplits(new JsonInputFormat(JSONPathSpec.DEFAULT, null, null), null);
    Assert.assertEquals(EXPECTED_COORDS, splits.map(InputSplit::get).collect(Collectors.toList()));
}
Also used : JsonInputFormat(org.apache.druid.data.input.impl.JsonInputFormat) CloudObjectLocation(org.apache.druid.data.input.impl.CloudObjectLocation) InputSplit(org.apache.druid.data.input.InputSplit) InitializedNullHandlingTest(org.apache.druid.testing.InitializedNullHandlingTest) Test(org.junit.Test)
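
Compared with the prefix-based test in Example 17, the only difference is which constructor slot is populated: here EXPECTED_URIS is passed as the fourth argument (the URI list) and the fifth argument (the prefix list) is null, while the previous test does the reverse. Everything downstream, including the InputSplit::get unwrapping, is identical.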

Example 19 with InputSplit

Use of org.apache.druid.data.input.InputSplit in project druid by druid-io.

From class DruidInputSource, method createSplits:

public static Iterator<InputSplit<List<WindowedSegmentId>>> createSplits(
    CoordinatorClient coordinatorClient,
    RetryPolicyFactory retryPolicyFactory,
    String dataSource,
    Interval interval,
    SplitHintSpec splitHintSpec
) {
    final SplitHintSpec convertedSplitHintSpec;
    if (splitHintSpec instanceof SegmentsSplitHintSpec) {
        final SegmentsSplitHintSpec segmentsSplitHintSpec = (SegmentsSplitHintSpec) splitHintSpec;
        convertedSplitHintSpec = new MaxSizeSplitHintSpec(
            segmentsSplitHintSpec.getMaxInputSegmentBytesPerTask(),
            segmentsSplitHintSpec.getMaxNumSegments()
        );
    } else {
        convertedSplitHintSpec = splitHintSpec;
    }
    final List<TimelineObjectHolder<String, DataSegment>> timelineSegments =
        getTimelineForInterval(coordinatorClient, retryPolicyFactory, dataSource, interval);
    final Map<WindowedSegmentId, Long> segmentIdToSize = createWindowedSegmentIdFromTimeline(timelineSegments);
    // noinspection ConstantConditions
    return Iterators.transform(
        convertedSplitHintSpec.split(
            // Group segments from the same time chunk into the same input split.
            segmentIdToSize.keySet().iterator(),
            segmentId -> new InputFileAttribute(
                Preconditions.checkNotNull(segmentIdToSize.get(segmentId), "segment size for [%s]", segmentId)
            )
        ),
        InputSplit::new
    );
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) SegmentCacheManagerFactory(org.apache.druid.indexing.common.SegmentCacheManagerFactory) TaskConfig(org.apache.druid.indexing.common.config.TaskConfig) Comparators(org.apache.druid.java.util.common.guava.Comparators) AbstractInputSource(org.apache.druid.data.input.AbstractInputSource) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) FluentIterable(com.google.common.collect.FluentIterable) Map(java.util.Map) InputSourceReader(org.apache.druid.data.input.InputSourceReader) IAE(org.apache.druid.java.util.common.IAE) JacksonInject(com.fasterxml.jackson.annotation.JacksonInject) RetryPolicyFactory(org.apache.druid.indexing.common.RetryPolicyFactory) InputFormat(org.apache.druid.data.input.InputFormat) Collection(java.util.Collection) SplitHintSpec(org.apache.druid.data.input.SplitHintSpec) SplittableInputSource(org.apache.druid.data.input.impl.SplittableInputSource) ISE(org.apache.druid.java.util.common.ISE) Objects(java.util.Objects) MaxSizeSplitHintSpec(org.apache.druid.data.input.MaxSizeSplitHintSpec) PartitionHolder(org.apache.druid.timeline.partition.PartitionHolder) List(java.util.List) Stream(java.util.stream.Stream) DimFilter(org.apache.druid.query.filter.DimFilter) DataSegment(org.apache.druid.timeline.DataSegment) SortedMap(java.util.SortedMap) Logger(org.apache.druid.java.util.common.logger.Logger) Streams(org.apache.druid.utils.Streams) InputSplit(org.apache.druid.data.input.InputSplit) Duration(org.joda.time.Duration) SegmentsSplitHintSpec(org.apache.druid.data.input.SegmentsSplitHintSpec) HashMap(java.util.HashMap) InputRowSchema(org.apache.druid.data.input.InputRowSchema) Iterators(com.google.common.collect.Iterators) ArrayList(java.util.ArrayList) PartitionChunk(org.apache.druid.timeline.partition.PartitionChunk) Interval(org.joda.time.Interval) ColumnHolder(org.apache.druid.segment.column.ColumnHolder) ImmutableList(com.google.common.collect.ImmutableList) WindowedSegmentId(org.apache.druid.indexing.firehose.WindowedSegmentId) CoordinatorClient(org.apache.druid.client.coordinator.CoordinatorClient) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) InputFileAttribute(org.apache.druid.data.input.InputFileAttribute) Nullable(javax.annotation.Nullable) RetryPolicy(org.apache.druid.indexing.common.RetryPolicy) VersionedIntervalTimeline(org.apache.druid.timeline.VersionedIntervalTimeline) Iterator(java.util.Iterator) TimelineObjectHolder(org.apache.druid.timeline.TimelineObjectHolder) File(java.io.File) InputEntityIteratingReader(org.apache.druid.data.input.impl.InputEntityIteratingReader) TreeMap(java.util.TreeMap) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) JsonInclude(com.fasterxml.jackson.annotation.JsonInclude) Preconditions(com.google.common.base.Preconditions) SegmentCacheManager(org.apache.druid.segment.loading.SegmentCacheManager) Comparator(java.util.Comparator) IndexIO(org.apache.druid.segment.IndexIO) Collections(java.util.Collections)
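
At its core, the split-hint step packs sized items into groups under a byte cap. Below is a toy, JDK-only sketch of that packing idea; it is our simplified stand-in, not Druid's actual MaxSizeSplitHintSpec, which additionally enforces a maximum number of files and handles unknown sizes:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

public class MaxSizeSplitDemo {
    // Pack items into groups whose total size stays at or under the cap,
    // starting a new group whenever the next item would exceed it.
    static <T> List<List<T>> split(Iterator<T> items, Map<T, Long> sizes, long maxBytesPerSplit) {
        List<List<T>> splits = new ArrayList<>();
        List<T> current = new ArrayList<>();
        long currentBytes = 0;
        while (items.hasNext()) {
            T item = items.next();
            long size = sizes.get(item);
            if (!current.isEmpty() && currentBytes + size > maxBytesPerSplit) {
                splits.add(current);
                current = new ArrayList<>();
                currentBytes = 0;
            }
            current.add(item);
            currentBytes += size;
        }
        if (!current.isEmpty()) {
            splits.add(current);
        }
        return splits;
    }

    public static void main(String[] args) {
        Map<String, Long> sizes = Map.of("seg-a", 400L, "seg-b", 300L, "seg-c", 500L);
        // Group three segments under a 700-byte cap.
        System.out.println(split(List.of("seg-a", "seg-b", "seg-c").iterator(), sizes, 700L));
        // -> [[seg-a, seg-b], [seg-c]]
    }
}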

Aggregations

InputSplit (org.apache.druid.data.input.InputSplit): 19
Test (org.junit.Test): 15
CloudObjectLocation (org.apache.druid.data.input.impl.CloudObjectLocation): 13
JsonInputFormat (org.apache.druid.data.input.impl.JsonInputFormat): 11
InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest): 11
MaxSizeSplitHintSpec (org.apache.druid.data.input.MaxSizeSplitHintSpec): 10
List (java.util.List): 5
ImmutableList (com.google.common.collect.ImmutableList): 4
HumanReadableBytes (org.apache.druid.java.util.common.HumanReadableBytes): 4
File (java.io.File): 3
URI (java.net.URI): 3
InputFileAttribute (org.apache.druid.data.input.InputFileAttribute): 3
JacksonInject (com.fasterxml.jackson.annotation.JacksonInject): 2
JsonCreator (com.fasterxml.jackson.annotation.JsonCreator): 2
JsonProperty (com.fasterxml.jackson.annotation.JsonProperty): 2
ArrayList (java.util.ArrayList): 2
Iterator (java.util.Iterator): 2
Stream (java.util.stream.Stream): 2
Nullable (javax.annotation.Nullable): 2
SplitHintSpec (org.apache.druid.data.input.SplitHintSpec): 2