Use of org.apache.druid.data.input.InputFileAttribute in project druid by druid-io.
From class HdfsInputSource, method createSplits:
@Override
public Stream<InputSplit<List<Path>>> createSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec)
    throws IOException
{
  cachePathsIfNeeded();
  final Iterator<List<Path>> splitIterator = getSplitHintSpecOrDefault(splitHintSpec).split(
      cachedPaths.iterator(),
      path -> {
        try {
          // Ask HDFS for the file length so the hint spec can weigh each path.
          final long size = path.getFileSystem(configuration).getFileStatus(path).getLen();
          return new InputFileAttribute(size);
        } catch (IOException e) {
          throw new UncheckedIOException(e);
        }
      });
  return Streams.sequentialStreamFrom(splitIterator).map(InputSplit::new);
}
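The hint spec uses each path's InputFileAttribute weight to decide how many paths to pack into one split. As a rough illustration of that grouping, here is a minimal self-contained sketch; the groupBySize helper, the 500-byte budget, and the hard-coded sizes are illustrative assumptions of ours, not Druid's actual MaxSizeSplitHintSpec implementation.

import java.util.ArrayList;
import java.util.List;

public class SizeGroupingSketch
{
  // Hypothetical stand-in for what a size-based SplitHintSpec does with
  // InputFileAttribute weights: pack items into groups under a byte budget.
  static List<List<String>> groupBySize(List<String> files, List<Long> sizes, long maxBytesPerSplit)
  {
    final List<List<String>> splits = new ArrayList<>();
    List<String> current = new ArrayList<>();
    long currentBytes = 0;
    for (int i = 0; i < files.size(); i++) {
      if (!current.isEmpty() && currentBytes + sizes.get(i) > maxBytesPerSplit) {
        splits.add(current);
        current = new ArrayList<>();
        currentBytes = 0;
      }
      current.add(files.get(i));
      currentBytes += sizes.get(i);
    }
    if (!current.isEmpty()) {
      splits.add(current);
    }
    return splits;
  }

  public static void main(String[] args)
  {
    // Files of 300, 300, and 100 bytes with a 500-byte budget
    // yield two splits: [a] and [b, c].
    System.out.println(groupBySize(List.of("a", "b", "c"), List.of(300L, 300L, 100L), 500L));
  }
}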
Use of org.apache.druid.data.input.InputFileAttribute in project druid by druid-io.
From class GoogleCloudStorageInputSource, method getPrefixesSplitStream:
@Override
protected Stream<InputSplit<List<CloudObjectLocation>>> getPrefixesSplitStream(@Nonnull SplitHintSpec splitHintSpec)
{
  final Iterator<List<StorageObject>> splitIterator = splitHintSpec.split(
      storageObjectIterable().iterator(),
      storageObject -> {
        // GCS reports object sizes as BigInteger; clamp values that do not fit in a long.
        final BigInteger sizeInBigInteger = storageObject.getSize();
        long sizeInLong;
        if (sizeInBigInteger == null) {
          sizeInLong = Long.MAX_VALUE;
        } else {
          try {
            sizeInLong = sizeInBigInteger.longValueExact();
          } catch (ArithmeticException e) {
            LOG.warn(
                e,
                "The object [%s, %s] has a size [%s] out of the range of the long type. "
                + "The max long value will be used for its size instead.",
                storageObject.getBucket(), storageObject.getName(), sizeInBigInteger);
            sizeInLong = Long.MAX_VALUE;
          }
        }
        return new InputFileAttribute(sizeInLong);
      });
  return Streams.sequentialStreamFrom(splitIterator)
                .map(objects -> objects.stream().map(this::byteSourceFromStorageObject).collect(Collectors.toList()))
                .map(InputSplit::new);
}
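The noteworthy detail is the defensive size conversion: GCS reports object sizes as BigInteger, which can exceed Long.MAX_VALUE, and a null or oversized value degrades to Long.MAX_VALUE rather than failing the split computation. That clamping can be exercised in isolation with plain JDK calls; in this standalone sketch the clampToLong helper is our own name for the logic above.

import java.math.BigInteger;

public class SizeClampSketch
{
  // Mirrors the GoogleCloudStorageInputSource fallback: null or out-of-range
  // sizes become Long.MAX_VALUE instead of throwing.
  static long clampToLong(BigInteger size)
  {
    if (size == null) {
      return Long.MAX_VALUE;
    }
    try {
      return size.longValueExact();
    } catch (ArithmeticException e) {
      return Long.MAX_VALUE;
    }
  }

  public static void main(String[] args)
  {
    System.out.println(clampToLong(BigInteger.valueOf(1024)));    // 1024
    System.out.println(clampToLong(null));                        // 9223372036854775807
    System.out.println(clampToLong(BigInteger.valueOf(2).pow(80))); // 9223372036854775807
  }
}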
Use of org.apache.druid.data.input.InputFileAttribute in project druid by druid-io.
From class DruidInputSource, method createSplits:
public static Iterator<InputSplit<List<WindowedSegmentId>>> createSplits(
    CoordinatorClient coordinatorClient, RetryPolicyFactory retryPolicyFactory,
    String dataSource, Interval interval, SplitHintSpec splitHintSpec)
{
  final SplitHintSpec convertedSplitHintSpec;
  if (splitHintSpec instanceof SegmentsSplitHintSpec) {
    // Translate the segment-oriented hint into an equivalent size-based hint.
    final SegmentsSplitHintSpec segmentsSplitHintSpec = (SegmentsSplitHintSpec) splitHintSpec;
    convertedSplitHintSpec = new MaxSizeSplitHintSpec(
        segmentsSplitHintSpec.getMaxInputSegmentBytesPerTask(),
        segmentsSplitHintSpec.getMaxNumSegments());
  } else {
    convertedSplitHintSpec = splitHintSpec;
  }
  final List<TimelineObjectHolder<String, DataSegment>> timelineSegments =
      getTimelineForInterval(coordinatorClient, retryPolicyFactory, dataSource, interval);
  final Map<WindowedSegmentId, Long> segmentIdToSize = createWindowedSegmentIdFromTimeline(timelineSegments);
  //noinspection ConstantConditions
  return Iterators.transform(
      convertedSplitHintSpec.split(
          // ... the same input split.
          segmentIdToSize.keySet().iterator(),
          segmentId -> new InputFileAttribute(
              Preconditions.checkNotNull(segmentIdToSize.get(segmentId), "segment size for [%s]", segmentId))),
      InputSplit::new);
}
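The conversion at the top normalizes both hint flavors onto one code path: a SegmentsSplitHintSpec expresses per-task limits (max input segment bytes, max segment count) that map directly onto a MaxSizeSplitHintSpec, so the split call only ever deals with size-based limits plus the per-segment weights carried by InputFileAttribute. The Preconditions.checkNotNull call turns a missing segment size into a fast failure that names the offending segment. A minimal sketch of that lookup follows, using Guava as the code above does; the string segment IDs and sizes are hypothetical stand-ins for WindowedSegmentId keys.

import com.google.common.base.Preconditions;
import java.util.Map;

public class SegmentWeightSketch
{
  public static void main(String[] args)
  {
    // Hypothetical data: segment IDs mapped to their byte sizes, playing the
    // role of the map that createWindowedSegmentIdFromTimeline produces.
    final Map<String, Long> segmentIdToSize = Map.of("2023-01-01/seg-0", 1_000_000L);

    // The same null check the split call performs: a missing size is a bug,
    // so fail fast with the offending segment ID in the message.
    final String segmentId = "2023-01-01/seg-0";
    final long size = Preconditions.checkNotNull(
        segmentIdToSize.get(segmentId),
        "segment size for [%s]",
        segmentId);
    System.out.println(size); // 1000000
  }
}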