Use of org.apache.druid.indexing.seekablestream.SeekableStreamDataSourceMetadata in project druid by druid-io.
Class SeekableStreamSupervisor, method resetInternal.
@VisibleForTesting
public void resetInternal(DataSourceMetadata dataSourceMetadata)
{
  if (dataSourceMetadata == null) {
    // Reset everything
    boolean result = indexerMetadataStorageCoordinator.deleteDataSourceMetadata(dataSource);
    log.info("Reset dataSource[%s] - dataSource metadata entry deleted? [%s]", dataSource, result);
    activelyReadingTaskGroups.values()
                             .forEach(group -> killTasksInGroup(group, "DataSourceMetadata is not found while reset"));
    activelyReadingTaskGroups.clear();
    partitionGroups.clear();
    partitionOffsets.clear();
  } else {
    if (!checkSourceMetadataMatch(dataSourceMetadata)) {
      throw new IAE(
          "Datasource metadata instance does not match required, found instance of [%s]",
          dataSourceMetadata.getClass()
      );
    }
    log.info("Reset dataSource[%s] with metadata[%s]", dataSource, dataSourceMetadata);

    // Reset only the partitions in dataSourceMetadata if it has not been reset yet
    @SuppressWarnings("unchecked")
    final SeekableStreamDataSourceMetadata<PartitionIdType, SequenceOffsetType> resetMetadata =
        (SeekableStreamDataSourceMetadata<PartitionIdType, SequenceOffsetType>) dataSourceMetadata;

    if (resetMetadata.getSeekableStreamSequenceNumbers().getStream().equals(ioConfig.getStream())) {
      // metadata can be null
      final DataSourceMetadata metadata = indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(dataSource);
      if (metadata != null && !checkSourceMetadataMatch(metadata)) {
        throw new IAE(
            "Datasource metadata instance does not match required, found instance of [%s]",
            metadata.getClass()
        );
      }

      @SuppressWarnings("unchecked")
      final SeekableStreamDataSourceMetadata<PartitionIdType, SequenceOffsetType> currentMetadata =
          (SeekableStreamDataSourceMetadata<PartitionIdType, SequenceOffsetType>) metadata;

      // defend against consecutive reset requests from replicas,
      // as well as the case where the metadata store does not have an entry for the reset partitions
      boolean doReset = false;
      for (Entry<PartitionIdType, SequenceOffsetType> resetPartitionOffset :
          resetMetadata.getSeekableStreamSequenceNumbers().getPartitionSequenceNumberMap().entrySet()) {
        final SequenceOffsetType partitionOffsetInMetadataStore = currentMetadata == null
            ? null
            : currentMetadata.getSeekableStreamSequenceNumbers()
                             .getPartitionSequenceNumberMap()
                             .get(resetPartitionOffset.getKey());
        final TaskGroup partitionTaskGroup = activelyReadingTaskGroups.get(
            getTaskGroupIdForPartition(resetPartitionOffset.getKey())
        );
        final boolean isSameOffset = partitionTaskGroup != null
            && partitionTaskGroup.startingSequences.get(resetPartitionOffset.getKey())
                                                   .equals(resetPartitionOffset.getValue());
        if (partitionOffsetInMetadataStore != null || isSameOffset) {
          doReset = true;
          break;
        }
      }

      if (!doReset) {
        log.info("Ignoring duplicate reset request [%s]", dataSourceMetadata);
        return;
      }

      boolean metadataUpdateSuccess;
      if (currentMetadata == null) {
        metadataUpdateSuccess = true;
      } else {
        final DataSourceMetadata newMetadata = currentMetadata.minus(resetMetadata);
        try {
          metadataUpdateSuccess = indexerMetadataStorageCoordinator.resetDataSourceMetadata(dataSource, newMetadata);
        } catch (IOException e) {
          log.error("Resetting DataSourceMetadata failed [%s]", e.getMessage());
          throw new RuntimeException(e);
        }
      }

      if (metadataUpdateSuccess) {
        resetMetadata.getSeekableStreamSequenceNumbers().getPartitionSequenceNumberMap().keySet().forEach(partition -> {
          final int groupId = getTaskGroupIdForPartition(partition);
          killTaskGroupForPartitions(ImmutableSet.of(partition), "DataSourceMetadata is updated while reset");
          activelyReadingTaskGroups.remove(groupId);
          // killTaskGroupForPartitions() cleans up partitionGroups.
          // Add the removed groups back.
          partitionGroups.computeIfAbsent(groupId, k -> new HashSet<>());
          partitionOffsets.put(partition, getNotSetMarker());
        });
      } else {
        throw new ISE("Unable to reset metadata");
      }
    } else {
      log.warn(
          "Reset metadata stream [%s] and supervisor's stream name [%s] do not match",
          resetMetadata.getSeekableStreamSequenceNumbers().getStream(),
          ioConfig.getStream()
      );
    }
  }
}
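For intuition, here is a minimal standalone sketch of the duplicate-reset guard in the doReset loop above. It uses plain maps with Integer partition ids and Long offsets instead of the real SeekableStreamDataSourceMetadata types; the class, method, and parameter names are illustrative, not Druid API.

import java.util.HashMap;
import java.util.Map;

// Illustrative sketch of the partial-reset guard in resetInternal(): the reset proceeds only if
// at least one requested partition still has an offset in the metadata store, or an actively
// reading task group started exactly at the requested offset. Simplified types, not the Druid API.
class PartialResetSketch
{
  static boolean shouldReset(
      Map<Integer, Long> resetOffsets,          // offsets from the reset request
      Map<Integer, Long> storedOffsets,         // offsets currently in the metadata store (may be empty)
      Map<Integer, Long> activeStartingOffsets  // starting offsets of actively reading task groups
  )
  {
    for (Map.Entry<Integer, Long> entry : resetOffsets.entrySet()) {
      final Long stored = storedOffsets.get(entry.getKey());
      final Long activeStart = activeStartingOffsets.get(entry.getKey());
      // Mirrors the doReset loop: reset if the store still knows this partition,
      // or an active task group started at the requested offset.
      if (stored != null || entry.getValue().equals(activeStart)) {
        return true;
      }
    }
    return false; // a repeated, identical reset request becomes a no-op
  }

  public static void main(String[] args)
  {
    Map<Integer, Long> reset = new HashMap<>();
    reset.put(0, 100L);
    Map<Integer, Long> stored = new HashMap<>(); // already wiped by a previous reset
    Map<Integer, Long> active = new HashMap<>();
    active.put(0, 100L);                         // a replica's task group still at offset 100

    System.out.println(shouldReset(reset, stored, active)); // true -> reset proceeds
  }
}

The point of the guard is that a second, identical reset request from a replica finds neither a stored offset nor a matching active task group, so it is logged and ignored rather than re-applied.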
Use of org.apache.druid.indexing.seekablestream.SeekableStreamDataSourceMetadata in project druid by druid-io.
Class SeekableStreamSupervisor, method cleanupClosedAndExpiredPartitions.
/**
 * This method determines the set of expired partitions from the set of partitions currently returned by
 * the record supplier and the set of partitions previously tracked in the metadata.
 * <p>
 * It will mark the expired partitions in metadata and recompute the partition->task group mappings, updating
 * the metadata, the partitionIds list, and the partitionGroups mappings.
 *
 * @param storedPartitions                Set of partitions previously tracked, from the metadata store
 * @param newlyClosedPartitions           Set of partitions that are closed in the metadata store but still present in the
 *                                        current {@link #partitionIds}
 * @param activePartitionsIdsFromSupplier Set of partitions currently returned by the record supplier, but with
 *                                        any partitions that are closed/expired in the metadata store removed
 * @param previouslyExpiredPartitions     Set of partitions that are recorded as expired in the metadata store
 * @param partitionIdsFromSupplier        Set of partitions currently returned by the record supplier.
 */
private void cleanupClosedAndExpiredPartitions(
    Set<PartitionIdType> storedPartitions,
    Set<PartitionIdType> newlyClosedPartitions,
    Set<PartitionIdType> activePartitionsIdsFromSupplier,
    Set<PartitionIdType> previouslyExpiredPartitions,
    Set<PartitionIdType> partitionIdsFromSupplier
)
{
  // If a partition was previously known (stored in metadata) but no longer appears in the list of partitions
  // provided by the record supplier, it has expired.
  Set<PartitionIdType> newlyExpiredPartitions = Sets.difference(storedPartitions, previouslyExpiredPartitions);
  newlyExpiredPartitions = Sets.difference(newlyExpiredPartitions, partitionIdsFromSupplier);

  if (!newlyExpiredPartitions.isEmpty()) {
    log.info("Detected newly expired partitions: " + newlyExpiredPartitions);

    // Mark partitions as expired in metadata
    @SuppressWarnings("unchecked")
    SeekableStreamDataSourceMetadata<PartitionIdType, SequenceOffsetType> currentMetadata =
        (SeekableStreamDataSourceMetadata<PartitionIdType, SequenceOffsetType>)
            indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(dataSource);

    SeekableStreamDataSourceMetadata<PartitionIdType, SequenceOffsetType> cleanedMetadata =
        createDataSourceMetadataWithExpiredPartitions(currentMetadata, newlyExpiredPartitions);

    log.info("New metadata after partition expiration: " + cleanedMetadata);

    validateMetadataPartitionExpiration(newlyExpiredPartitions, currentMetadata, cleanedMetadata);

    try {
      boolean success = indexerMetadataStorageCoordinator.resetDataSourceMetadata(dataSource, cleanedMetadata);
      if (!success) {
        log.error("Failed to update datasource metadata[%s] with expired partitions removed", cleanedMetadata);
      }
    } catch (IOException ioe) {
      throw new RuntimeException(ioe);
    }
  }

  if (!newlyClosedPartitions.isEmpty()) {
    log.info("Detected newly closed partitions: " + newlyClosedPartitions);
  }

  // Partitions have been dropped
  if (!newlyClosedPartitions.isEmpty() || !newlyExpiredPartitions.isEmpty()) {
    // Compute new partition groups, only including partitions that are
    // still in partitionIdsFromSupplier and not closed
    Map<Integer, Set<PartitionIdType>> newPartitionGroups =
        recomputePartitionGroupsForExpiration(activePartitionsIdsFromSupplier);

    validatePartitionGroupReassignments(activePartitionsIdsFromSupplier, newPartitionGroups);

    log.info("New partition groups after removing closed and expired partitions: " + newPartitionGroups);

    partitionIds.clear();
    partitionIds.addAll(activePartitionsIdsFromSupplier);
    assignRecordSupplierToPartitionIds();

    for (Integer groupId : partitionGroups.keySet()) {
      if (newPartitionGroups.containsKey(groupId)) {
        partitionGroups.put(groupId, newPartitionGroups.get(groupId));
      } else {
        partitionGroups.put(groupId, new HashSet<>());
      }
    }
  }
}
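The newlyExpiredPartitions computation above is just two set differences. Here is a standalone sketch with made-up partition ids; the Guava Sets.difference calls mirror the ones in the method, everything else is illustrative.

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;

import java.util.Set;

// Standalone illustration of the newlyExpiredPartitions computation: partitions that were stored
// in metadata, are not already marked expired, and are no longer returned by the record supplier.
class ExpiredPartitionSketch
{
  public static void main(String[] args)
  {
    Set<String> storedPartitions = ImmutableSet.of("p0", "p1", "p2", "p3");
    Set<String> previouslyExpiredPartitions = ImmutableSet.of("p0");
    Set<String> partitionIdsFromSupplier = ImmutableSet.of("p1", "p2");

    Set<String> newlyExpired =
        Sets.difference(
            Sets.difference(storedPartitions, previouslyExpiredPartitions),
            partitionIdsFromSupplier
        );

    // [p3]: stored before, not yet marked expired, and gone from the supplier
    System.out.println(newlyExpired);
  }
}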
Use of org.apache.druid.indexing.seekablestream.SeekableStreamDataSourceMetadata in project druid by druid-io.
Class SeekableStreamSupervisor, method verifyAndMergeCheckpoints.
/**
 * This method does two things:
 * 1. Makes sure the checkpoint information in the taskGroup is consistent with that of the tasks and, if not, kills
 *    the inconsistent tasks.
 * 2. Truncates the checkpoints in the taskGroup for which segments have already been published, so that any newly
 *    created tasks for the taskGroup start indexing from after the latest published sequences.
 */
private void verifyAndMergeCheckpoints(final TaskGroup taskGroup)
{
  final int groupId = taskGroup.groupId;
  final List<Pair<String, TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>>>> taskSequences = new ArrayList<>();
  final List<ListenableFuture<TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>>>> futures = new ArrayList<>();
  final List<String> taskIds = new ArrayList<>();

  for (String taskId : taskGroup.taskIds()) {
    final ListenableFuture<TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>>> checkpointsFuture =
        taskClient.getCheckpointsAsync(taskId, true);
    futures.add(checkpointsFuture);
    taskIds.add(taskId);
  }

  try {
    List<TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>>> futuresResult =
        Futures.successfulAsList(futures).get(futureTimeoutInSeconds, TimeUnit.SECONDS);

    for (int i = 0; i < futuresResult.size(); i++) {
      final TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>> checkpoints = futuresResult.get(i);
      final String taskId = taskIds.get(i);
      if (checkpoints == null) {
        try {
          // catch the exception in failed futures
          futures.get(i).get();
        } catch (Exception e) {
          stateManager.recordThrowableEvent(e);
          log.error(e, "Problem while getting checkpoints for task [%s], killing the task", taskId);
          killTask(taskId, "Exception[%s] while getting checkpoints", e.getClass());
          taskGroup.tasks.remove(taskId);
        }
      } else if (checkpoints.isEmpty()) {
        log.warn("Ignoring task [%s], as probably it is not started running yet", taskId);
      } else {
        taskSequences.add(new Pair<>(taskId, checkpoints));
      }
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }

  final DataSourceMetadata rawDataSourceMetadata = indexerMetadataStorageCoordinator.retrieveDataSourceMetadata(dataSource);
  if (rawDataSourceMetadata != null && !checkSourceMetadataMatch(rawDataSourceMetadata)) {
    throw new IAE(
        "Datasource metadata instance does not match required, found instance of [%s]",
        rawDataSourceMetadata.getClass()
    );
  }

  @SuppressWarnings("unchecked")
  final SeekableStreamDataSourceMetadata<PartitionIdType, SequenceOffsetType> latestDataSourceMetadata =
      (SeekableStreamDataSourceMetadata<PartitionIdType, SequenceOffsetType>) rawDataSourceMetadata;

  final boolean hasValidOffsetsFromDb = latestDataSourceMetadata != null
      && latestDataSourceMetadata.getSeekableStreamSequenceNumbers() != null
      && ioConfig.getStream().equals(latestDataSourceMetadata.getSeekableStreamSequenceNumbers().getStream());
  final Map<PartitionIdType, SequenceOffsetType> latestOffsetsFromDb;
  if (hasValidOffsetsFromDb) {
    latestOffsetsFromDb = latestDataSourceMetadata.getSeekableStreamSequenceNumbers().getPartitionSequenceNumberMap();
  } else {
    latestOffsetsFromDb = null;
  }

  // order tasks of this taskGroup by the latest sequenceId
  taskSequences.sort((o1, o2) -> o2.rhs.firstKey().compareTo(o1.rhs.firstKey()));

  final Set<String> tasksToKill = new HashSet<>();
  final AtomicInteger earliestConsistentSequenceId = new AtomicInteger(-1);
  int taskIndex = 0;

  while (taskIndex < taskSequences.size()) {
    TreeMap<Integer, Map<PartitionIdType, SequenceOffsetType>> taskCheckpoints = taskSequences.get(taskIndex).rhs;
    String taskId = taskSequences.get(taskIndex).lhs;

    if (earliestConsistentSequenceId.get() == -1) {
      // store the earliest sequence id at which this task's checkpoint agrees with the latest offsets in the metadata store
      if (taskCheckpoints.entrySet().stream().anyMatch(
          sequenceCheckpoint -> sequenceCheckpoint.getValue().entrySet().stream().allMatch(
              partitionOffset -> {
                OrderedSequenceNumber<SequenceOffsetType> sequence = makeSequenceNumber(partitionOffset.getValue());
                OrderedSequenceNumber<SequenceOffsetType> latestOffset = makeSequenceNumber(
                    latestOffsetsFromDb == null
                        ? partitionOffset.getValue()
                        : latestOffsetsFromDb.getOrDefault(partitionOffset.getKey(), partitionOffset.getValue())
                );
                return sequence.compareTo(latestOffset) == 0;
              }
          ) && earliestConsistentSequenceId.compareAndSet(-1, sequenceCheckpoint.getKey()))
          || (pendingCompletionTaskGroups.getOrDefault(groupId, new CopyOnWriteArrayList<>()).size() > 0
              && earliestConsistentSequenceId.compareAndSet(-1, taskCheckpoints.firstKey()))) {
        final SortedMap<Integer, Map<PartitionIdType, SequenceOffsetType>> latestCheckpoints =
            new TreeMap<>(taskCheckpoints.tailMap(earliestConsistentSequenceId.get()));
        log.info("Setting taskGroup sequences to [%s] for group [%d]", latestCheckpoints, groupId);
        taskGroup.checkpointSequences.clear();
        taskGroup.checkpointSequences.putAll(latestCheckpoints);
      } else {
        log.debug(
            "Adding task [%s] to kill list, checkpoints[%s], latestoffsets from DB [%s]",
            taskId,
            taskCheckpoints,
            latestOffsetsFromDb
        );
        tasksToKill.add(taskId);
      }
    } else {
      // check consistency with taskGroup sequences
      if (taskCheckpoints.get(taskGroup.checkpointSequences.firstKey()) == null
          || !(taskCheckpoints.get(taskGroup.checkpointSequences.firstKey())
                              .equals(taskGroup.checkpointSequences.firstEntry().getValue()))
          || taskCheckpoints.tailMap(taskGroup.checkpointSequences.firstKey()).size()
             != taskGroup.checkpointSequences.size()) {
        log.debug(
            "Adding task [%s] to kill list, checkpoints[%s], taskgroup checkpoints [%s]",
            taskId,
            taskCheckpoints,
            taskGroup.checkpointSequences
        );
        tasksToKill.add(taskId);
      }
    }
    taskIndex++;
  }

  if ((tasksToKill.size() > 0 && tasksToKill.size() == taskGroup.tasks.size())
      || (taskGroup.tasks.size() == 0
          && pendingCompletionTaskGroups.getOrDefault(groupId, new CopyOnWriteArrayList<>()).size() == 0)) {
    // killing all tasks, or no task left in the group?
    // clear state about the taskGroup so that the latest sequence information is fetched from the metadata store
    log.warn("Clearing task group [%d] information as no valid tasks left the group", groupId);
    activelyReadingTaskGroups.remove(groupId);
    for (PartitionIdType partitionId : taskGroup.startingSequences.keySet()) {
      partitionOffsets.put(partitionId, getNotSetMarker());
    }
  }

  taskSequences.stream().filter(taskIdSequences -> tasksToKill.contains(taskIdSequences.lhs)).forEach(
      sequenceCheckpoint -> {
        killTask(
            sequenceCheckpoint.lhs,
            "Killing task [%s], as its checkpoints [%s] are not consistent with group checkpoints[%s] or latest "
            + "persisted sequences in metadata store [%s]",
            sequenceCheckpoint.lhs,
            sequenceCheckpoint.rhs,
            taskGroup.checkpointSequences,
            latestOffsetsFromDb
        );
        taskGroup.tasks.remove(sequenceCheckpoint.lhs);
      }
  );
}
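The per-task consistency check in the else branch above is easier to follow in isolation. Below is a simplified, standalone version with Integer sequence ids and Long offsets; the class name, method name, and sample data are illustrative, not the supervisor's actual fields.

import com.google.common.collect.ImmutableMap;

import java.util.Map;
import java.util.TreeMap;

// A task is considered consistent with its task group if it still has the group's first
// checkpoint with identical partition offsets, and the same number of checkpoints from
// that sequence id onwards.
class CheckpointConsistencySketch
{
  static boolean isConsistentWithGroup(
      TreeMap<Integer, Map<Integer, Long>> taskCheckpoints,
      TreeMap<Integer, Map<Integer, Long>> groupCheckpoints
  )
  {
    final int firstGroupSequence = groupCheckpoints.firstKey();
    final Map<Integer, Long> taskOffsetsAtFirstGroupSequence = taskCheckpoints.get(firstGroupSequence);
    return taskOffsetsAtFirstGroupSequence != null
           && taskOffsetsAtFirstGroupSequence.equals(groupCheckpoints.firstEntry().getValue())
           && taskCheckpoints.tailMap(firstGroupSequence).size() == groupCheckpoints.size();
  }

  public static void main(String[] args)
  {
    // Group checkpoints were already truncated to start at sequence id 2.
    TreeMap<Integer, Map<Integer, Long>> group = new TreeMap<>();
    group.put(2, ImmutableMap.of(0, 150L));

    // The task still remembers an older, already-published checkpoint at sequence id 1.
    TreeMap<Integer, Map<Integer, Long>> task = new TreeMap<>();
    task.put(1, ImmutableMap.of(0, 100L));
    task.put(2, ImmutableMap.of(0, 150L));

    // tailMap(2) of the task has one entry matching the group's single checkpoint -> consistent.
    System.out.println(isConsistentWithGroup(task, group)); // true
  }
}

A task failing this check is added to tasksToKill and later killed with the "not consistent with group checkpoints" message shown in the method above.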