Use of org.apache.druid.data.input.InputSplit in project druid by druid-io.
From the class GoogleCloudStorageInputSource, method getPrefixesSplitStream.
@Override
protected Stream<InputSplit<List<CloudObjectLocation>>> getPrefixesSplitStream(@Nonnull SplitHintSpec splitHintSpec) {
  final Iterator<List<StorageObject>> splitIterator = splitHintSpec.split(
      storageObjectIterable().iterator(),
      storageObject -> {
        final BigInteger sizeInBigInteger = storageObject.getSize();
        long sizeInLong;
        if (sizeInBigInteger == null) {
          sizeInLong = Long.MAX_VALUE;
        } else {
          try {
            sizeInLong = sizeInBigInteger.longValueExact();
          } catch (ArithmeticException e) {
            LOG.warn(
                e,
                "The object [%s, %s] has a size [%s] out of the range of the long type. "
                + "The max long value will be used for its size instead.",
                storageObject.getBucket(),
                storageObject.getName(),
                sizeInBigInteger
            );
            sizeInLong = Long.MAX_VALUE;
          }
        }
        return new InputFileAttribute(sizeInLong);
      }
  );
  return Streams.sequentialStreamFrom(splitIterator)
                .map(objects -> objects.stream().map(this::byteSourceFromStorageObject).collect(Collectors.toList()))
                .map(InputSplit::new);
}
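The subtle part of this method is the size handling: GCS reports object sizes as BigInteger, and BigInteger.longValueExact() throws ArithmeticException when the value does not fit in a long. A minimal, self-contained sketch of that fallback behavior (illustrative only, not Druid code):

import java.math.BigInteger;

// Illustrative only: shows why the method above catches ArithmeticException and
// substitutes Long.MAX_VALUE for sizes that overflow a long.
public class SizeFallbackExample {
  public static void main(String[] args) {
    BigInteger tooBig = BigInteger.valueOf(Long.MAX_VALUE).add(BigInteger.ONE);
    long size;
    try {
      size = tooBig.longValueExact();
    } catch (ArithmeticException e) {
      size = Long.MAX_VALUE; // same fallback as getPrefixesSplitStream
    }
    System.out.println(size); // prints 9223372036854775807
  }
}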
From the class S3InputSourceTest, method testCreateSplitsWithEmptyObjectsIteratingOnlyNonEmptyObjects.
@Test
public void testCreateSplitsWithEmptyObjectsIteratingOnlyNonEmptyObjects() {
  EasyMock.reset(S3_CLIENT);
  expectListObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)), CONTENT);
  expectListObjects(PREFIXES.get(1), ImmutableList.of(EXPECTED_URIS.get(1)), new byte[0]);
  EasyMock.replay(S3_CLIENT);
  S3InputSource inputSource = new S3InputSource(
      SERVICE, SERVER_SIDE_ENCRYPTING_AMAZON_S3_BUILDER, INPUT_DATA_CONFIG,
      null, PREFIXES, null, null
  );
  Stream<InputSplit<List<CloudObjectLocation>>> splits =
      inputSource.createSplits(new JsonInputFormat(JSONPathSpec.DEFAULT, null, null), null);
  Assert.assertEquals(
      ImmutableList.of(ImmutableList.of(new CloudObjectLocation(EXPECTED_URIS.get(0)))),
      splits.map(InputSplit::get).collect(Collectors.toList())
  );
  EasyMock.verify(S3_CLIENT);
}
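The test arranges for the object under PREFIXES.get(1) to be listed with zero-byte content and then asserts that only the non-empty object under PREFIXES.get(0) appears in a split. Conceptually (this is not S3InputSource's actual implementation, and the `summaries` listing below is hypothetical), the verified behavior is equivalent to filtering empty objects out before grouping:

// Conceptual sketch only: zero-byte objects contribute no splits.
List<S3ObjectSummary> nonEmpty = summaries.stream()
    .filter(summary -> summary.getSize() > 0)
    .collect(Collectors.toList());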
From the class S3InputSourceTest, method testWithUrisSplit.
@Test
public void testWithUrisSplit() {
  S3InputSource inputSource = new S3InputSource(
      SERVICE, SERVER_SIDE_ENCRYPTING_AMAZON_S3_BUILDER, INPUT_DATA_CONFIG,
      EXPECTED_URIS, null, null, null
  );
  Stream<InputSplit<List<CloudObjectLocation>>> splits =
      inputSource.createSplits(new JsonInputFormat(JSONPathSpec.DEFAULT, null, null), null);
  Assert.assertEquals(EXPECTED_COORDS, splits.map(InputSplit::get).collect(Collectors.toList()));
}
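EXPECTED_COORDS is not shown on this page; for the assertion above to compare equal against a List<List<CloudObjectLocation>>, it presumably pairs each entry of EXPECTED_URIS with a single-location split. A hypothetical construction (not necessarily the test's actual field initializer):

// Hypothetical: one singleton split per URI, matching the shape the assertion expects.
List<List<CloudObjectLocation>> expectedCoords = EXPECTED_URIS.stream()
    .map(uri -> Collections.singletonList(new CloudObjectLocation(uri)))
    .collect(Collectors.toList());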
From the class DruidInputSource, method createSplits.
public static Iterator<InputSplit<List<WindowedSegmentId>>> createSplits(
    CoordinatorClient coordinatorClient, RetryPolicyFactory retryPolicyFactory,
    String dataSource, Interval interval, SplitHintSpec splitHintSpec
) {
  final SplitHintSpec convertedSplitHintSpec;
  if (splitHintSpec instanceof SegmentsSplitHintSpec) {
    final SegmentsSplitHintSpec segmentsSplitHintSpec = (SegmentsSplitHintSpec) splitHintSpec;
    convertedSplitHintSpec = new MaxSizeSplitHintSpec(
        segmentsSplitHintSpec.getMaxInputSegmentBytesPerTask(), segmentsSplitHintSpec.getMaxNumSegments());
  } else {
    convertedSplitHintSpec = splitHintSpec;
  }
  final List<TimelineObjectHolder<String, DataSegment>> timelineSegments =
      getTimelineForInterval(coordinatorClient, retryPolicyFactory, dataSource, interval);
  final Map<WindowedSegmentId, Long> segmentIdToSize = createWindowedSegmentIdFromTimeline(timelineSegments);
  //noinspection ConstantConditions
  return Iterators.transform(
      convertedSplitHintSpec.split(
          // Pack the windowed segment IDs into groups so that segments assigned to one sub-task
          // end up in the same input split.
          segmentIdToSize.keySet().iterator(),
          segmentId -> new InputFileAttribute(
              Preconditions.checkNotNull(segmentIdToSize.get(segmentId), "segment size for [%s]", segmentId))
      ),
      InputSplit::new
  );
}
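A hedged usage sketch of the iterator returned above: each InputSplit wraps the group of windowed segment IDs that one parallel sub-task would re-read. The arguments (coordinatorClient, retryPolicyFactory, dataSource, interval, splitHintSpec) are assumed to be in scope; the loop itself is illustrative, not DruidInputSource code:

// Illustrative consumption of the splits produced by createSplits; not from the Druid source.
Iterator<InputSplit<List<WindowedSegmentId>>> splits = DruidInputSource.createSplits(
    coordinatorClient, retryPolicyFactory, dataSource, interval, splitHintSpec);
int numSubTasks = 0;
while (splits.hasNext()) {
  List<WindowedSegmentId> segmentsForOneSubTask = splits.next().get();
  numSubTasks++;
}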