use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.
the class IngestSegmentFirehoseFactory method connect.
@Override
public Firehose connect(InputRowParser inputRowParser, File temporaryDirectory) throws ParseException {
log.debug("Connecting firehose: dataSource[%s], interval[%s], segmentIds[%s]", dataSource, interval, segmentIds);
final List<TimelineObjectHolder<String, DataSegment>> timeLineSegments = getTimeline();
// Download all segments locally.
// Note: this requires enough local storage space to fit all of the segments, even though
// IngestSegmentFirehose iterates over the segments in series. We may want to change this
// to download files lazily, perhaps sharing code with PrefetchableTextFilesFirehoseFactory.
final SegmentCacheManager segmentCacheManager = segmentCacheManagerFactory.manufacturate(temporaryDirectory);
Map<DataSegment, File> segmentFileMap = Maps.newLinkedHashMap();
for (TimelineObjectHolder<String, DataSegment> holder : timeLineSegments) {
for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
final DataSegment segment = chunk.getObject();
segmentFileMap.computeIfAbsent(segment, k -> {
try {
return segmentCacheManager.getSegmentFiles(segment);
} catch (SegmentLoadingException e) {
throw new RuntimeException(e);
}
});
}
}
final List<String> dims = ReingestionTimelineUtils.getDimensionsToReingest(dimensions, inputRowParser.getParseSpec().getDimensionsSpec(), timeLineSegments);
final List<String> metricsList = metrics == null ? ReingestionTimelineUtils.getUniqueMetrics(timeLineSegments) : metrics;
final List<WindowedStorageAdapter> adapters = Lists.newArrayList(Iterables.concat(Iterables.transform(timeLineSegments, new Function<TimelineObjectHolder<String, DataSegment>, Iterable<WindowedStorageAdapter>>() {
@Override
public Iterable<WindowedStorageAdapter> apply(final TimelineObjectHolder<String, DataSegment> holder) {
return Iterables.transform(holder.getObject(), new Function<PartitionChunk<DataSegment>, WindowedStorageAdapter>() {
@Override
public WindowedStorageAdapter apply(final PartitionChunk<DataSegment> input) {
final DataSegment segment = input.getObject();
try {
return new WindowedStorageAdapter(new QueryableIndexStorageAdapter(indexIO.loadIndex(Preconditions.checkNotNull(segmentFileMap.get(segment), "File for segment %s", segment.getId()))), holder.getInterval());
} catch (IOException e) {
throw new RuntimeException(e);
}
}
});
}
})));
final TransformSpec transformSpec = TransformSpec.fromInputRowParser(inputRowParser);
return new IngestSegmentFirehose(adapters, transformSpec, dims, metricsList, dimFilter);
}
use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.
the class DruidInputSource method createSplits.
public static Iterator<InputSplit<List<WindowedSegmentId>>> createSplits(CoordinatorClient coordinatorClient, RetryPolicyFactory retryPolicyFactory, String dataSource, Interval interval, SplitHintSpec splitHintSpec) {
final SplitHintSpec convertedSplitHintSpec;
if (splitHintSpec instanceof SegmentsSplitHintSpec) {
final SegmentsSplitHintSpec segmentsSplitHintSpec = (SegmentsSplitHintSpec) splitHintSpec;
convertedSplitHintSpec = new MaxSizeSplitHintSpec(segmentsSplitHintSpec.getMaxInputSegmentBytesPerTask(), segmentsSplitHintSpec.getMaxNumSegments());
} else {
convertedSplitHintSpec = splitHintSpec;
}
final List<TimelineObjectHolder<String, DataSegment>> timelineSegments = getTimelineForInterval(coordinatorClient, retryPolicyFactory, dataSource, interval);
final Map<WindowedSegmentId, Long> segmentIdToSize = createWindowedSegmentIdFromTimeline(timelineSegments);
// noinspection ConstantConditions
return Iterators.transform(convertedSplitHintSpec.split(// the same input split.
segmentIdToSize.keySet().iterator(), segmentId -> new InputFileAttribute(Preconditions.checkNotNull(segmentIdToSize.get(segmentId), "segment size for [%s]", segmentId))), InputSplit::new);
}
use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.
the class DruidInputSource method fixedFormatReader.
@Override
protected InputSourceReader fixedFormatReader(InputRowSchema inputRowSchema, @Nullable File temporaryDirectory) {
final SegmentCacheManager segmentCacheManager = segmentCacheManagerFactory.manufacturate(temporaryDirectory);
final List<TimelineObjectHolder<String, DataSegment>> timeline = createTimeline();
final Iterator<DruidSegmentInputEntity> entityIterator = FluentIterable.from(timeline).transformAndConcat(holder -> {
// noinspection ConstantConditions
final PartitionHolder<DataSegment> partitionHolder = holder.getObject();
// noinspection ConstantConditions
return FluentIterable.from(partitionHolder).transform(chunk -> new DruidSegmentInputEntity(segmentCacheManager, chunk.getObject(), holder.getInterval()));
}).iterator();
final DruidSegmentInputFormat inputFormat = new DruidSegmentInputFormat(indexIO, dimFilter);
final InputRowSchema inputRowSchemaToUse;
if (taskConfig.isIgnoreTimestampSpecForDruidInputSource()) {
// Legacy compatibility mode; see https://github.com/apache/druid/pull/10267.
LOG.warn("Ignoring the provided timestampSpec and reading the __time column instead. To use timestampSpecs with " + "the 'druid' input source, set druid.indexer.task.ignoreTimestampSpecForDruidInputSource to false.");
inputRowSchemaToUse = new InputRowSchema(new TimestampSpec(ColumnHolder.TIME_COLUMN_NAME, STANDARD_TIME_COLUMN_FORMATS.iterator().next(), null), inputRowSchema.getDimensionsSpec(), inputRowSchema.getColumnsFilter().plus(ColumnHolder.TIME_COLUMN_NAME));
} else {
inputRowSchemaToUse = inputRowSchema;
}
if (ColumnHolder.TIME_COLUMN_NAME.equals(inputRowSchemaToUse.getTimestampSpec().getTimestampColumn()) && !STANDARD_TIME_COLUMN_FORMATS.contains(inputRowSchemaToUse.getTimestampSpec().getTimestampFormat())) {
// Slight chance the user did this intentionally, but not likely. Log a warning.
LOG.warn("The provided timestampSpec refers to the %s column without using format %s. If you wanted to read the " + "column as-is, switch formats.", inputRowSchemaToUse.getTimestampSpec().getTimestampColumn(), STANDARD_TIME_COLUMN_FORMATS);
}
return new InputEntityIteratingReader(inputRowSchemaToUse, inputFormat, entityIterator, temporaryDirectory);
}
use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinator method createNewSegment.
/**
* This function creates a new segment for the given datasource/interval/etc. A critical
* aspect of the creation is to make sure that the new version & new partition number will make
* sense given the existing segments & pending segments also very important is to avoid
* clashes with existing pending & used/unused segments.
* @param handle Database handle
* @param dataSource datasource for the new segment
* @param interval interval for the new segment
* @param partialShardSpec Shard spec info minus segment id stuff
* @param existingVersion Version of segments in interval, used to compute the version of the very first segment in
* interval
* @return
* @throws IOException
*/
@Nullable
private SegmentIdWithShardSpec createNewSegment(final Handle handle, final String dataSource, final Interval interval, final PartialShardSpec partialShardSpec, final String existingVersion) throws IOException {
// Get the time chunk and associated data segments for the given interval, if any
final List<TimelineObjectHolder<String, DataSegment>> existingChunks = getTimelineForIntervalsWithHandle(handle, dataSource, ImmutableList.of(interval)).lookup(interval);
if (existingChunks.size() > 1) {
// Not possible to expand more than one chunk with a single segment.
log.warn("Cannot allocate new segment for dataSource[%s], interval[%s]: already have [%,d] chunks.", dataSource, interval, existingChunks.size());
return null;
} else {
// max partitionId of the shardSpecs which share the same partition space.
SegmentIdWithShardSpec maxId = null;
if (!existingChunks.isEmpty()) {
TimelineObjectHolder<String, DataSegment> existingHolder = Iterables.getOnlyElement(existingChunks);
// noinspection ConstantConditions
for (DataSegment segment : FluentIterable.from(existingHolder.getObject()).transform(PartitionChunk::getObject).filter(segment -> segment.getShardSpec().sharePartitionSpace(partialShardSpec))) {
// Note that this will compute the max id of existing, visible, data segments in the time chunk:
if (maxId == null || maxId.getShardSpec().getPartitionNum() < segment.getShardSpec().getPartitionNum()) {
maxId = SegmentIdWithShardSpec.fromDataSegment(segment);
}
}
}
// Get the version of the existing chunk, we might need it in some of the cases below
// to compute the new identifier's version
@Nullable final String versionOfExistingChunk;
if (!existingChunks.isEmpty()) {
// remember only one chunk possible for given interval so get the first & only one
versionOfExistingChunk = existingChunks.get(0).getVersion();
} else {
versionOfExistingChunk = null;
}
// next, we need to enrich the maxId computed before with the information of the pending segments
// it is possible that a pending segment has a higher id in which case we need that, it will work,
// and it will avoid clashes when inserting the new pending segment later in the caller of this method
final Set<SegmentIdWithShardSpec> pendings = getPendingSegmentsForIntervalWithHandle(handle, dataSource, interval);
// Make sure we add the maxId we obtained from the segments table:
if (maxId != null) {
pendings.add(maxId);
}
// Now compute the maxId with all the information: pendings + segments:
// The versionOfExistingChunks filter is ensure that we pick the max id with the version of the existing chunk
// in the case that there may be a pending segment with a higher version but no corresponding used segments
// which may generate a clash with an existing segment once the new id is generated
maxId = pendings.stream().filter(id -> id.getShardSpec().sharePartitionSpace(partialShardSpec)).filter(id -> versionOfExistingChunk == null ? true : id.getVersion().equals(versionOfExistingChunk)).max((id1, id2) -> {
final int versionCompare = id1.getVersion().compareTo(id2.getVersion());
if (versionCompare != 0) {
return versionCompare;
} else {
return Integer.compare(id1.getShardSpec().getPartitionNum(), id2.getShardSpec().getPartitionNum());
}
}).orElse(null);
// The following code attempts to compute the new version, if this
// new version is not null at the end of next block then it will be
// used as the new version in the case for initial or appended segment
final String newSegmentVersion;
if (versionOfExistingChunk != null) {
// segment version overrides, so pick that now that we know it exists
newSegmentVersion = versionOfExistingChunk;
} else if (!pendings.isEmpty() && maxId != null) {
// there is no visible segments in the time chunk, so pick the maxId of pendings, as computed above
newSegmentVersion = maxId.getVersion();
} else {
// no segments, no pendings, so this must be the very first segment created for this interval
newSegmentVersion = null;
}
if (maxId == null) {
// When appending segments, null maxId means that we are allocating the very initial
// segment for this time chunk.
// This code is executed when the Overlord coordinates segment allocation, which is either you append segments
// or you use segment lock. Since the core partitions set is not determined for appended segments, we set
// it 0. When you use segment lock, the core partitions set doesn't work with it. We simply set it 0 so that the
// OvershadowableManager handles the atomic segment update.
final int newPartitionId = partialShardSpec.useNonRootGenerationPartitionSpace() ? PartitionIds.NON_ROOT_GEN_START_PARTITION_ID : PartitionIds.ROOT_GEN_START_PARTITION_ID;
String version = newSegmentVersion == null ? existingVersion : newSegmentVersion;
return new SegmentIdWithShardSpec(dataSource, interval, version, partialShardSpec.complete(jsonMapper, newPartitionId, 0));
} else if (!maxId.getInterval().equals(interval) || maxId.getVersion().compareTo(existingVersion) > 0) {
log.warn("Cannot allocate new segment for dataSource[%s], interval[%s], existingVersion[%s]: conflicting segment[%s].", dataSource, interval, existingVersion, maxId);
return null;
} else if (maxId.getShardSpec().getNumCorePartitions() == SingleDimensionShardSpec.UNKNOWN_NUM_CORE_PARTITIONS) {
log.warn("Cannot allocate new segment because of unknown core partition size of segment[%s], shardSpec[%s]", maxId, maxId.getShardSpec());
return null;
} else {
return new SegmentIdWithShardSpec(dataSource, maxId.getInterval(), Preconditions.checkNotNull(newSegmentVersion, "newSegmentVersion"), partialShardSpec.complete(jsonMapper, maxId.getShardSpec().getPartitionNum() + 1, maxId.getShardSpec().getNumCorePartitions()));
}
}
}
use of org.apache.druid.timeline.TimelineObjectHolder in project druid by druid-io.
the class IndexerSQLMetadataStorageCoordinator method announceHistoricalSegments.
@Override
public SegmentPublishResult announceHistoricalSegments(final Set<DataSegment> segments, final Set<DataSegment> segmentsToDrop, @Nullable final DataSourceMetadata startMetadata, @Nullable final DataSourceMetadata endMetadata) throws IOException {
if (segments.isEmpty()) {
throw new IllegalArgumentException("segment set must not be empty");
}
final String dataSource = segments.iterator().next().getDataSource();
for (DataSegment segment : segments) {
if (!dataSource.equals(segment.getDataSource())) {
throw new IllegalArgumentException("segments must all be from the same dataSource");
}
}
if ((startMetadata == null && endMetadata != null) || (startMetadata != null && endMetadata == null)) {
throw new IllegalArgumentException("start/end metadata pair must be either null or non-null");
}
// Find which segments are used (i.e. not overshadowed).
final Set<DataSegment> usedSegments = new HashSet<>();
List<TimelineObjectHolder<String, DataSegment>> segmentHolders = VersionedIntervalTimeline.forSegments(segments).lookupWithIncompletePartitions(Intervals.ETERNITY);
for (TimelineObjectHolder<String, DataSegment> holder : segmentHolders) {
for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
usedSegments.add(chunk.getObject());
}
}
final AtomicBoolean definitelyNotUpdated = new AtomicBoolean(false);
try {
return connector.retryTransaction(new TransactionCallback<SegmentPublishResult>() {
@Override
public SegmentPublishResult inTransaction(final Handle handle, final TransactionStatus transactionStatus) throws Exception {
// Set definitelyNotUpdated back to false upon retrying.
definitelyNotUpdated.set(false);
if (startMetadata != null) {
final DataStoreMetadataUpdateResult result = updateDataSourceMetadataWithHandle(handle, dataSource, startMetadata, endMetadata);
if (result != DataStoreMetadataUpdateResult.SUCCESS) {
// Metadata was definitely not updated.
transactionStatus.setRollbackOnly();
definitelyNotUpdated.set(true);
if (result == DataStoreMetadataUpdateResult.FAILURE) {
throw new RuntimeException("Aborting transaction!");
} else if (result == DataStoreMetadataUpdateResult.TRY_AGAIN) {
throw new RetryTransactionException("Aborting transaction!");
}
}
}
if (segmentsToDrop != null && !segmentsToDrop.isEmpty()) {
final DataStoreMetadataUpdateResult result = dropSegmentsWithHandle(handle, segmentsToDrop, dataSource);
if (result != DataStoreMetadataUpdateResult.SUCCESS) {
// Metadata store was definitely not updated.
transactionStatus.setRollbackOnly();
definitelyNotUpdated.set(true);
if (result == DataStoreMetadataUpdateResult.FAILURE) {
throw new RuntimeException("Aborting transaction!");
} else if (result == DataStoreMetadataUpdateResult.TRY_AGAIN) {
throw new RetryTransactionException("Aborting transaction!");
}
}
}
final Set<DataSegment> inserted = announceHistoricalSegmentBatch(handle, segments, usedSegments);
return SegmentPublishResult.ok(ImmutableSet.copyOf(inserted));
}
}, 3, getSqlMetadataMaxRetry());
} catch (CallbackFailedException e) {
if (definitelyNotUpdated.get()) {
return SegmentPublishResult.fail(e.getMessage());
} else {
// Must throw exception if we are not sure if we updated or not.
throw e;
}
}
}
Aggregations