Use of org.apache.druid.indexing.seekablestream.common.OrderedSequenceNumber in the druid project by druid-io.
From the class SeekableStreamSupervisor, method createNewTasks.
/**
 * Ensures every partition group has an actively-reading task group and that each such task group
 * has the configured number of replica tasks.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Re-verify the checkpoints of every active task group that currently has fewer tasks than
 *       {@code ioConfig.getReplicas()}.</li>
 *   <li>For each group id in {@code partitionGroups} with no entry in
 *       {@code activelyReadingTaskGroups}, build a new {@link TaskGroup} from freshly generated
 *       starting sequences and register it.</li>
 *   <li>For each active task group that still has something to read, create the missing replica
 *       tasks.</li>
 *   <li>If any task was created and {@code firstRunTime} is already in the past, schedule another
 *       run 5000 ms from now.</li>
 * </ol>
 *
 * @throws JsonProcessingException declared by this method; presumably propagated from
 *                                 {@code createTasksForGroup} - confirm against that method
 */
private void createNewTasks() throws JsonProcessingException {
  // Bring the checkpoints of under-replicated groups up to date first, so that replacement tasks
  // do not re-read what has already been published.
  final List<TaskGroup> underReplicatedGroups = activelyReadingTaskGroups
      .values()
      .stream()
      .filter(taskGroup -> taskGroup.tasks.size() < ioConfig.getReplicas())
      .collect(Collectors.toList());
  verifyAndMergeCheckpoints(underReplicatedGroups);

  // Make sure there is a current task group for each group of partitions in [partitionGroups].
  for (Integer groupId : partitionGroups.keySet()) {
    if (activelyReadingTaskGroups.containsKey(groupId)) {
      continue;
    }
    log.info("Creating new task group [%d] for partitions %s", groupId, partitionGroups.get(groupId));

    // Lower time bound: an explicit start date-time wins over a rejection period relative to now.
    final Optional<DateTime> minimumMessageTime;
    if (ioConfig.getLateMessageRejectionStartDateTime().isPresent()) {
      minimumMessageTime = Optional.of(ioConfig.getLateMessageRejectionStartDateTime().get());
    } else if (ioConfig.getLateMessageRejectionPeriod().isPresent()) {
      minimumMessageTime = Optional.of(
          DateTimes.nowUtc().minus(ioConfig.getLateMessageRejectionPeriod().get())
      );
    } else {
      minimumMessageTime = Optional.absent();
    }

    // Upper time bound: now + task duration + early rejection period, when the latter is configured.
    final Optional<DateTime> maximumMessageTime;
    if (ioConfig.getEarlyMessageRejectionPeriod().isPresent()) {
      maximumMessageTime = Optional.of(
          DateTimes.nowUtc()
                   .plus(ioConfig.getTaskDuration())
                   .plus(ioConfig.getEarlyMessageRejectionPeriod().get())
      );
    } else {
      maximumMessageTime = Optional.absent();
    }

    final Map<PartitionIdType, OrderedSequenceNumber<SequenceOffsetType>> unfilteredStartingOffsets =
        generateStartingSequencesForPartitionGroup(groupId);
    final Map<PartitionIdType, OrderedSequenceNumber<SequenceOffsetType>> startingOffsets =
        supportsPartitionExpiration()
        ? filterExpiredPartitionsFromStartingOffsets(unfilteredStartingOffsets)
        : unfilteredStartingOffsets;

    final ImmutableMap<PartitionIdType, SequenceOffsetType> simpleStartingOffsets =
        toPlainSequenceOffsets(startingOffsets);
    // Without partition expiration the filtered and unfiltered maps are the very same instance.
    final ImmutableMap<PartitionIdType, SequenceOffsetType> simpleUnfilteredStartingOffsets =
        supportsPartitionExpiration()
        ? toPlainSequenceOffsets(unfilteredStartingOffsets)
        : simpleStartingOffsets;

    final Set<PartitionIdType> exclusiveStartSequenceNumberPartitions;
    if (useExclusiveStartingSequence) {
      exclusiveStartSequenceNumberPartitions = startingOffsets
          .entrySet()
          .stream()
          .filter(offset -> offset.getValue().get() != null && offset.getValue().isExclusive())
          .map(Entry::getKey)
          .collect(Collectors.toSet());
    } else {
      exclusiveStartSequenceNumberPartitions = Collections.emptySet();
    }

    final TaskGroup newTaskGroup = new TaskGroup(
        groupId,
        simpleStartingOffsets,
        simpleUnfilteredStartingOffsets,
        minimumMessageTime,
        maximumMessageTime,
        exclusiveStartSequenceNumberPartitions
    );
    activelyReadingTaskGroups.put(groupId, newTaskGroup);
  }

  // Walk all current task groups and top each one up to the desired number of replica tasks.
  boolean createdTask = false;
  for (Entry<Integer, TaskGroup> groupEntry : activelyReadingTaskGroups.entrySet()) {
    final Integer groupId = groupEntry.getKey();
    final TaskGroup taskGroup = groupEntry.getValue();

    // A group has nothing to read when it has no starting sequences at all, or when every
    // starting sequence is null or marks the end of a shard.
    final boolean nothingToRead =
        taskGroup.startingSequences == null
        || taskGroup.startingSequences.isEmpty()
        || taskGroup.startingSequences.values().stream().noneMatch(x -> x != null && !isEndOfShard(x));
    if (nothingToRead) {
      log.debug("Nothing to read in any partition for taskGroup [%d], skipping task creation", groupId);
      continue;
    }

    final int missingReplicas = ioConfig.getReplicas() - taskGroup.tasks.size();
    if (missingReplicas > 0) {
      log.info(
          "Number of tasks [%d] does not match configured numReplicas [%d] in task group [%d], creating more tasks",
          taskGroup.tasks.size(),
          ioConfig.getReplicas(),
          groupId
      );
      createTasksForGroup(groupId, missingReplicas);
      createdTask = true;
    }
  }

  if (createdTask && firstRunTime.isBeforeNow()) {
    // Schedule a run event after a short delay to update our internal data structures with the new tasks that were
    // just created. This is mainly for the benefit of the status API in situations where the run period is lengthy.
    scheduledExec.schedule(buildRunTask(), 5000, TimeUnit.MILLISECONDS);
  }
}

/**
 * Unwraps a map of {@link OrderedSequenceNumber} values into an immutable map of the raw sequence
 * offsets, dropping every entry whose wrapped offset is {@code null}.
 *
 * @param offsets partition id to wrapped sequence number
 * @return immutable copy mapping each retained partition id to its unwrapped offset
 */
private ImmutableMap<PartitionIdType, SequenceOffsetType> toPlainSequenceOffsets(
    final Map<PartitionIdType, OrderedSequenceNumber<SequenceOffsetType>> offsets
) {
  return offsets
      .entrySet()
      .stream()
      .filter(offset -> offset.getValue().get() != null)
      .collect(
          Collectors.collectingAndThen(
              Collectors.toMap(Entry::getKey, offset -> offset.getValue().get()),
              ImmutableMap::copyOf
          )
      );
}
Use of org.apache.druid.indexing.seekablestream.common.OrderedSequenceNumber in the druid project by druid-io.
From the class SeekableStreamSupervisor, method verifyAndMergeCheckpoints.
/**
 * This method does two things -
 * 1. Makes sure the checkpoints information in the taskGroup is consistent with that of the tasks, if not kill
 * inconsistent tasks.
 * 2. truncates the checkpoints in the taskGroup corresponding to which segments have been published, so that any newly
 * created tasks for the taskGroup start indexing from after the latest published sequences.
 *
 * <p>Checkpoints are requested from every task of the group asynchronously and awaited for at most
 * {@code futureTimeoutInSeconds} seconds. A task whose request failed is killed and removed from the
 * group; a task that returned an empty checkpoint map is ignored for this round.
 *
 * <p>If every task of the group ends up on the kill list, or the group has neither tasks nor
 * pending-completion task groups, the group is removed from {@code activelyReadingTaskGroups} and the
 * offsets of its partitions are reset to the not-set marker.
 *
 * @param taskGroup the task group whose checkpoints are verified; its {@code tasks} and
 *                  {@code checkpointSequences} may be modified
 * @throws IAE              if datasource metadata exists but {@code checkSourceMetadataMatch} rejects it
 * @throws RuntimeException wrapping any exception raised while waiting for the checkpoint futures
 *                          (including timeout and interruption; on interruption the thread's
 *                          interrupt flag is restored before throwing)
 */
private void verifyAndMergeCheckpoints(final TaskGroup taskGroup) {
  final int groupId = taskGroup.groupId;
  // (taskId, checkpoints) pairs for the tasks that returned a non-empty checkpoint map
  final List<Pair<String, TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>>>> taskSequences = new ArrayList<>();
  // futures and taskIds are parallel lists: futures.get(i) belongs to taskIds.get(i)
  final List<ListenableFuture<TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>>>> futures = new ArrayList<>();
  final List<String> taskIds = new ArrayList<>();

  for (String taskId : taskGroup.taskIds()) {
    final ListenableFuture<TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>>> checkpointsFuture = taskClient.getCheckpointsAsync(taskId, true);
    futures.add(checkpointsFuture);
    taskIds.add(taskId);
  }

  try {
    // successfulAsList yields null in place of the result of any failed future
    List<TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>>> futuresResult = Futures.successfulAsList(futures).get(futureTimeoutInSeconds, TimeUnit.SECONDS);
    for (int i = 0; i < futuresResult.size(); i++) {
      final TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>> checkpoints = futuresResult.get(i);
      final String taskId = taskIds.get(i);
      if (checkpoints == null) {
        try {
          // catch the exception in failed futures
          futures.get(i).get();
        } catch (Exception e) {
          stateManager.recordThrowableEvent(e);
          log.error(e, "Problem while getting checkpoints for task [%s], killing the task", taskId);
          killTask(taskId, "Exception[%s] while getting checkpoints", e.getClass());
          taskGroup.tasks.remove(taskId);
        }
      } else if (checkpoints.isEmpty()) {
        log.warn("Ignoring task [%s], as probably it is not started running yet", taskId);
      } else {
        taskSequences.add(new Pair<>(taskId, checkpoints));
      }
    }
  } catch (InterruptedException e) {
    // Restore the interrupt flag so code further up the stack can still observe the interruption;
    // the exception is wrapped exactly as before so callers see the same exception type.
    Thread.currentThread().interrupt();
    throw new RuntimeException(e);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }

  final DataSourceMetadata rawDataSourceMetadata = indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(dataSource);
  if (rawDataSourceMetadata != null && !checkSourceMetadataMatch(rawDataSourceMetadata)) {
    throw new IAE("Datasource metadata instance does not match required, found instance of [%s]", rawDataSourceMetadata.getClass());
  }
  // The cast relies on the checkSourceMetadataMatch guard above.
  @SuppressWarnings("unchecked") final SeekableStreamDataSourceMetadata<PartitionIdType, SequenceOffsetType> latestDataSourceMetadata = (SeekableStreamDataSourceMetadata<PartitionIdType, SequenceOffsetType>) rawDataSourceMetadata;

  // Stored offsets are only used when they exist and were recorded for the currently configured stream.
  final boolean hasValidOffsetsFromDb = latestDataSourceMetadata != null && latestDataSourceMetadata.getSeekableStreamSequenceNumbers() != null && ioConfig.getStream().equals(latestDataSourceMetadata.getSeekableStreamSequenceNumbers().getStream());
  final Map<PartitionIdType, SequenceOffsetType> latestOffsetsFromDb;
  if (hasValidOffsetsFromDb) {
    latestOffsetsFromDb = latestDataSourceMetadata.getSeekableStreamSequenceNumbers().getPartitionSequenceNumberMap();
  } else {
    latestOffsetsFromDb = null;
  }

  // order tasks of this taskGroup by the latest sequenceId
  // (descending by the first key of each task's checkpoint map)
  taskSequences.sort((o1, o2) -> o2.rhs.firstKey().compareTo(o1.rhs.firstKey()));

  final Set<String> tasksToKill = new HashSet<>();
  // -1 means "no consistent checkpoint found yet"; an AtomicInteger is used so the lambdas below can set it
  final AtomicInteger earliestConsistentSequenceId = new AtomicInteger(-1);
  int taskIndex = 0;

  while (taskIndex < taskSequences.size()) {
    TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>> taskCheckpoints = taskSequences.get(taskIndex).rhs;
    String taskId = taskSequences.get(taskIndex).lhs;
    if (earliestConsistentSequenceId.get() == -1) {
      // No reference checkpoint yet. Try to establish one from this task:
      //  - a checkpoint whose every partition offset equals the offset stored in the metadata store
      //    (a partition with no stored offset, or no valid stored offsets at all, is compared
      //    against itself and therefore always matches); or, failing that,
      //  - this task's first checkpoint, if the group has pending-completion task groups.
      // The compareAndSet calls record the chosen sequence id as a side effect of the condition.
      if (taskCheckpoints.entrySet().stream().anyMatch(
          sequenceCheckpoint -> sequenceCheckpoint.getValue().entrySet().stream().allMatch(
              partitionOffset -> {
                OrderedSequenceNumber<SequenceOffsetType> sequence = makeSequenceNumber(partitionOffset.getValue());
                OrderedSequenceNumber<SequenceOffsetType> latestOffset = makeSequenceNumber(
                    latestOffsetsFromDb == null
                    ? partitionOffset.getValue()
                    : latestOffsetsFromDb.getOrDefault(partitionOffset.getKey(), partitionOffset.getValue())
                );
                return sequence.compareTo(latestOffset) == 0;
              }
          ) && earliestConsistentSequenceId.compareAndSet(-1, sequenceCheckpoint.getKey()))
          || (pendingCompletionTaskGroups.getOrDefault(groupId, new CopyOnWriteArrayList<>()).size() > 0
              && earliestConsistentSequenceId.compareAndSet(-1, taskCheckpoints.firstKey()))) {
        // Keep only the checkpoints from the chosen sequence id onwards as the group's checkpoints.
        final SortedMap<Integer, Map<PartitionIdType, SequenceOffsetType>> latestCheckpoints = new TreeMap<>(taskCheckpoints.tailMap(earliestConsistentSequenceId.get()));
        log.info("Setting taskGroup sequences to [%s] for group [%d]", latestCheckpoints, groupId);
        taskGroup.checkpointSequences.clear();
        taskGroup.checkpointSequences.putAll(latestCheckpoints);
      } else {
        log.debug("Adding task [%s] to kill list, checkpoints[%s], latestoffsets from DB [%s]", taskId, taskCheckpoints, latestOffsetsFromDb);
        tasksToKill.add(taskId);
      }
    } else {
      // check consistency with taskGroup sequences: the task must contain the group's first checkpoint
      // with identical offsets, and the same number of checkpoints from that sequence id onwards
      if (taskCheckpoints.get(taskGroup.checkpointSequences.firstKey()) == null || !(taskCheckpoints.get(taskGroup.checkpointSequences.firstKey()).equals(taskGroup.checkpointSequences.firstEntry().getValue())) || taskCheckpoints.tailMap(taskGroup.checkpointSequences.firstKey()).size() != taskGroup.checkpointSequences.size()) {
        log.debug("Adding task [%s] to kill list, checkpoints[%s], taskgroup checkpoints [%s]", taskId, taskCheckpoints, taskGroup.checkpointSequences);
        tasksToKill.add(taskId);
      }
    }
    taskIndex++;
  }

  if ((tasksToKill.size() > 0 && tasksToKill.size() == taskGroup.tasks.size()) || (taskGroup.tasks.size() == 0 && pendingCompletionTaskGroups.getOrDefault(groupId, new CopyOnWriteArrayList<>()).size() == 0)) {
    // killing all tasks or no task left in the group ?
    // clear state about the taskgroup so that get latest sequence information is fetched from metadata store
    log.warn("Clearing task group [%d] information as no valid tasks left the group", groupId);
    activelyReadingTaskGroups.remove(groupId);
    for (PartitionIdType partitionId : taskGroup.startingSequences.keySet()) {
      partitionOffsets.put(partitionId, getNotSetMarker());
    }
  }

  // Finally kill every task flagged as inconsistent and drop it from the group.
  taskSequences.stream().filter(taskIdSequences -> tasksToKill.contains(taskIdSequences.lhs)).forEach(sequenceCheckpoint -> {
    killTask(sequenceCheckpoint.lhs, "Killing task [%s], as its checkpoints [%s] are not consistent with group checkpoints[%s] or latest " + "persisted sequences in metadata store [%s]", sequenceCheckpoint.lhs, sequenceCheckpoint.rhs, taskGroup.checkpointSequences, latestOffsetsFromDb);
    taskGroup.tasks.remove(sequenceCheckpoint.lhs);
  });
}
Aggregations