use of org.apache.samza.checkpoint.CheckpointId in project samza by apache.
the class TestTransactionalStateTaskRestoreManager method testStoreDeletedWhenCleanDirsFlagSet.
@Test
public void testStoreDeletedWhenCleanDirsFlagSet() {
TaskModel mockTaskModel = mock(TaskModel.class);
TaskName taskName = new TaskName("Partition 0");
when(mockTaskModel.getTaskName()).thenReturn(taskName);
Partition taskChangelogPartition = new Partition(0);
when(mockTaskModel.getChangelogPartition()).thenReturn(taskChangelogPartition);
when(mockTaskModel.getTaskMode()).thenReturn(TaskMode.Active);
String store1Name = "store1";
StorageEngine store1Engine = mock(StorageEngine.class);
StoreProperties mockStore1Properties = mock(StoreProperties.class);
when(store1Engine.getStoreProperties()).thenReturn(mockStore1Properties);
when(mockStore1Properties.isLoggedStore()).thenReturn(true);
when(mockStore1Properties.isPersistedToDisk()).thenReturn(true);
Map<String, StorageEngine> mockStoreEngines = ImmutableMap.of(store1Name, store1Engine);
String changelog1SystemName = "system1";
String changelog1StreamName = "store1Changelog";
SystemStream changelog1SystemStream = new SystemStream(changelog1SystemName, changelog1StreamName);
SystemStreamPartition changelog1SSP = new SystemStreamPartition(changelog1SystemStream, taskChangelogPartition);
SystemStreamPartitionMetadata changelog1SSPMetadata = new SystemStreamPartitionMetadata("0", "10", "11");
Map<String, SystemStream> mockStoreChangelogs = ImmutableMap.of(store1Name, changelog1SystemStream);
String changelog1CheckpointedOffset = "5";
CheckpointId checkpointId = CheckpointId.create();
KafkaStateCheckpointMarker kafkaStateCheckpointMarker = new KafkaStateCheckpointMarker(changelog1SSP, changelog1CheckpointedOffset);
ImmutableMap<String, KafkaStateCheckpointMarker> mockCheckpointedChangelogOffset = ImmutableMap.of(store1Name, kafkaStateCheckpointMarker);
Map<SystemStreamPartition, SystemStreamPartitionMetadata> mockCurrentChangelogOffsets = ImmutableMap.of(changelog1SSP, changelog1SSPMetadata);
SystemAdmins mockSystemAdmins = mock(SystemAdmins.class);
SystemAdmin mockSystemAdmin = mock(SystemAdmin.class);
when(mockSystemAdmins.getSystemAdmin(changelog1SSP.getSystem())).thenReturn(mockSystemAdmin);
StorageManagerUtil mockStorageManagerUtil = mock(StorageManagerUtil.class);
File mockLoggedStoreBaseDir = mock(File.class);
File mockNonLoggedStoreBaseDir = mock(File.class);
// set the clean.on.container.start config set on the store
Config mockConfig = new MapConfig(Collections.singletonMap("stores.store1.clean.on.container.start", "true"));
Clock mockClock = mock(Clock.class);
Mockito.when(mockSystemAdmin.offsetComparator(anyString(), anyString())).thenAnswer((Answer<Integer>) invocation -> {
String offset1 = (String) invocation.getArguments()[0];
String offset2 = (String) invocation.getArguments()[1];
return Long.valueOf(offset1).compareTo(Long.valueOf(offset2));
});
File dummyCurrentDir = new File("currentDir");
File dummyCheckpointDir = new File("checkpointDir1");
when(mockStorageManagerUtil.getTaskStoreDir(mockLoggedStoreBaseDir, store1Name, taskName, TaskMode.Active)).thenReturn(dummyCurrentDir);
when(mockStorageManagerUtil.getTaskStoreCheckpointDirs(mockLoggedStoreBaseDir, store1Name, taskName, TaskMode.Active)).thenReturn(ImmutableList.of(dummyCheckpointDir));
StoreActions storeActions = TransactionalStateTaskRestoreManager.getStoreActions(mockTaskModel, mockStoreEngines, mockStoreChangelogs, mockCheckpointedChangelogOffset, checkpointId, mockCurrentChangelogOffsets, mockSystemAdmins, mockStorageManagerUtil, mockLoggedStoreBaseDir, mockNonLoggedStoreBaseDir, mockConfig, mockClock);
// ensure that current and checkpoint directories are marked for deletion
assertEquals(2, storeActions.storeDirsToDelete.size());
assertTrue(storeActions.storeDirsToDelete.containsValue(dummyCheckpointDir));
assertTrue(storeActions.storeDirsToDelete.containsValue(dummyCurrentDir));
// ensure that we restore from the oldest changelog offset to checkpointed changelog offset
assertEquals("0", storeActions.storesToRestore.get(store1Name).startingOffset);
assertEquals(changelog1CheckpointedOffset, storeActions.storesToRestore.get(store1Name).endingOffset);
}
use of org.apache.samza.checkpoint.CheckpointId in project samza by apache.
the class TestTransactionalStateTaskRestoreManager method testGetStoreActionsForLoggedPersistentStore_FullRestoreIfEqualCheckpointedOldestAndNewestOffset.
/**
* This is the case when the changelog topic is empty but not new. E.g., if wrote 100 messages,
* then deleted 100 messages, and after compaction oldest == newest == checkpointed. In this case
* full restore does not do anything since there is nothing to restore or trim, but the code path will
* leave us in a consistent state with the appropriate stores deleted and retained.
*/
@Test
public void testGetStoreActionsForLoggedPersistentStore_FullRestoreIfEqualCheckpointedOldestAndNewestOffset() {
TaskModel mockTaskModel = mock(TaskModel.class);
TaskName taskName = new TaskName("Partition 0");
when(mockTaskModel.getTaskName()).thenReturn(taskName);
Partition taskChangelogPartition = new Partition(0);
when(mockTaskModel.getChangelogPartition()).thenReturn(taskChangelogPartition);
String store1Name = "store1";
StorageEngine store1Engine = mock(StorageEngine.class);
StoreProperties mockStore1Properties = mock(StoreProperties.class);
when(store1Engine.getStoreProperties()).thenReturn(mockStore1Properties);
when(mockStore1Properties.isLoggedStore()).thenReturn(true);
when(mockStore1Properties.isPersistedToDisk()).thenReturn(true);
Map<String, StorageEngine> mockStoreEngines = ImmutableMap.of(store1Name, store1Engine);
String changelog1SystemName = "system1";
String changelog1StreamName = "store1Changelog";
SystemStream changelog1SystemStream = new SystemStream(changelog1SystemName, changelog1StreamName);
SystemStreamPartition changelog1SSP = new SystemStreamPartition(changelog1SystemStream, taskChangelogPartition);
SystemStreamPartitionMetadata changelog1SSPMetadata = new SystemStreamPartitionMetadata("5", "5", "6");
Map<String, SystemStream> mockStoreChangelogs = ImmutableMap.of(store1Name, changelog1SystemStream);
String changelog1CheckpointedOffset = "5";
CheckpointId checkpointId = CheckpointId.create();
KafkaStateCheckpointMarker kafkaStateCheckpointMarker = new KafkaStateCheckpointMarker(changelog1SSP, changelog1CheckpointedOffset);
Map<String, KafkaStateCheckpointMarker> mockCheckpointedChangelogOffset = new HashMap<String, KafkaStateCheckpointMarker>() {
{
put(store1Name, kafkaStateCheckpointMarker);
}
};
Map<SystemStreamPartition, SystemStreamPartitionMetadata> mockCurrentChangelogOffsets = ImmutableMap.of(changelog1SSP, changelog1SSPMetadata);
SystemAdmins mockSystemAdmins = mock(SystemAdmins.class);
SystemAdmin mockSystemAdmin = mock(SystemAdmin.class);
when(mockSystemAdmins.getSystemAdmin(changelog1SSP.getSystem())).thenReturn(mockSystemAdmin);
StorageManagerUtil mockStorageManagerUtil = mock(StorageManagerUtil.class);
File mockLoggedStoreBaseDir = mock(File.class);
File mockNonLoggedStoreBaseDir = mock(File.class);
HashMap<String, String> configMap = new HashMap<>();
// should not matter
configMap.put(TaskConfig.TRANSACTIONAL_STATE_RETAIN_EXISTING_STATE, "true");
Config mockConfig = new MapConfig(configMap);
Clock mockClock = mock(Clock.class);
File mockCurrentStoreDir = mock(File.class);
File mockStoreCheckpointDir = mock(File.class);
String checkpointDirLocalOffset = "5";
when(mockStorageManagerUtil.getTaskStoreDir(eq(mockLoggedStoreBaseDir), eq(store1Name), eq(taskName), any())).thenReturn(mockCurrentStoreDir);
when(mockStorageManagerUtil.getTaskStoreCheckpointDirs(eq(mockLoggedStoreBaseDir), eq(store1Name), eq(taskName), any())).thenReturn(ImmutableList.of(mockStoreCheckpointDir));
when(mockStorageManagerUtil.isLoggedStoreValid(eq(store1Name), eq(mockStoreCheckpointDir), any(), eq(mockStoreChangelogs), eq(mockTaskModel), any(), eq(mockStoreEngines))).thenReturn(true);
Set<SystemStreamPartition> mockChangelogSSPs = ImmutableSet.of(changelog1SSP);
when(mockStorageManagerUtil.readOffsetFile(eq(mockStoreCheckpointDir), eq(mockChangelogSSPs), eq(false))).thenReturn(ImmutableMap.of(changelog1SSP, checkpointDirLocalOffset));
Mockito.when(mockSystemAdmin.offsetComparator(anyString(), anyString())).thenAnswer((Answer<Integer>) invocation -> {
String offset1 = (String) invocation.getArguments()[0];
String offset2 = (String) invocation.getArguments()[1];
if (offset1 == null || offset2 == null) {
return -1;
}
return Long.valueOf(offset1).compareTo(Long.valueOf(offset2));
});
StoreActions storeActions = TransactionalStateTaskRestoreManager.getStoreActions(mockTaskModel, mockStoreEngines, mockStoreChangelogs, mockCheckpointedChangelogOffset, checkpointId, mockCurrentChangelogOffsets, mockSystemAdmins, mockStorageManagerUtil, mockLoggedStoreBaseDir, mockNonLoggedStoreBaseDir, mockConfig, mockClock);
// ensure that current and old checkpoint dirs are marked for deletion
assertEquals(1, storeActions.storeDirsToDelete.get(store1Name).size());
assertTrue(storeActions.storeDirsToDelete.get(store1Name).contains(mockCurrentStoreDir));
// ensure that latest checkpoint dir is retained
assertEquals(mockStoreCheckpointDir, storeActions.storeDirsToRetain.get(store1Name));
// ensure that we do a full restore (on the empty topic)
assertEquals("5", storeActions.storesToRestore.get(store1Name).startingOffset);
assertEquals("5", storeActions.storesToRestore.get(store1Name).endingOffset);
}
use of org.apache.samza.checkpoint.CheckpointId in project samza by apache.
the class TransactionalStateTaskRestoreManager method getStoreActions.
/**
* Marks each persistent but non-logged store for deletion.
*
* For each logged store, based on the current, checkpointed and local changelog offsets,
* 1. decides which directories (current and checkpoints) to delete for persistent stores.
* 2. decides which directories (checkpoints) to retain for persistent stores.
* 3. decides which stores (persistent or not) need to be restored, and the beginning and end offsets for the restore.
*
* When this method returns, in StoreActions,
* 1. all persistent store current directories will be present in storeDirsToDelete
* 2. each persistent store checkpoint directory will be present in either storeDirToRetain or storeDirsToDelete.
* 3. there will be at most one storeDirToRetain per persistent store, which will be a checkpoint directory.
* 4. any stores (persistent or not) that need to be restored from changelogs will be present in
* storesToRestore with appropriate offsets.
*/
@VisibleForTesting
static StoreActions getStoreActions(TaskModel taskModel, Map<String, StorageEngine> storeEngines, Map<String, SystemStream> storeChangelogs, Map<String, KafkaStateCheckpointMarker> kafkaStateCheckpointMarkers, CheckpointId checkpointId, Map<SystemStreamPartition, SystemStreamPartitionMetadata> currentChangelogOffsets, SystemAdmins systemAdmins, StorageManagerUtil storageManagerUtil, File loggedStoreBaseDirectory, File nonLoggedStoreBaseDirectory, Config config, Clock clock) {
TaskName taskName = taskModel.getTaskName();
TaskMode taskMode = taskModel.getTaskMode();
Map<String, File> storeDirToRetain = new HashMap<>();
ListMultimap<String, File> storeDirsToDelete = ArrayListMultimap.create();
Map<String, RestoreOffsets> storesToRestore = new HashMap<>();
storeEngines.forEach((storeName, storageEngine) -> {
// do nothing if store is non persistent and not logged (e.g. in memory cache only)
if (!storageEngine.getStoreProperties().isPersistedToDisk() && !storageEngine.getStoreProperties().isLoggedStore()) {
return;
}
// persistent but non-logged stores are always deleted
if (storageEngine.getStoreProperties().isPersistedToDisk() && !storageEngine.getStoreProperties().isLoggedStore()) {
File currentDir = storageManagerUtil.getTaskStoreDir(nonLoggedStoreBaseDirectory, storeName, taskName, taskMode);
LOG.info("Marking current directory: {} for store: {} in task: {} for deletion since it is not a logged store.", currentDir, storeName, taskName);
storeDirsToDelete.put(storeName, currentDir);
// persistent but non-logged stores should not have checkpoint dirs
return;
}
// get the oldest and newest current changelog SSP offsets as well as the checkpointed changelog SSP offset
SystemStream changelog = storeChangelogs.get(storeName);
SystemStreamPartition changelogSSP = new SystemStreamPartition(changelog, taskModel.getChangelogPartition());
SystemAdmin admin = systemAdmins.getSystemAdmin(changelogSSP.getSystem());
SystemStreamPartitionMetadata changelogSSPMetadata = currentChangelogOffsets.get(changelogSSP);
String oldestOffset = changelogSSPMetadata.getOldestOffset();
String newestOffset = changelogSSPMetadata.getNewestOffset();
// can be null if no message, or message has null offset
String checkpointedOffset = null;
if (kafkaStateCheckpointMarkers.containsKey(storeName) && StringUtils.isNotBlank(kafkaStateCheckpointMarkers.get(storeName).getChangelogOffset())) {
checkpointedOffset = kafkaStateCheckpointMarkers.get(storeName).getChangelogOffset();
}
long timeSinceLastCheckpointInMs = checkpointId == null ? Long.MAX_VALUE : System.currentTimeMillis() - checkpointId.getMillis();
// if the clean.store.start config is set, delete current and checkpoint dirs, restore from oldest offset to checkpointed
if (storageEngine.getStoreProperties().isPersistedToDisk() && new StorageConfig(config).cleanLoggedStoreDirsOnStart(storeName)) {
File currentDir = storageManagerUtil.getTaskStoreDir(loggedStoreBaseDirectory, storeName, taskName, taskMode);
LOG.info("Marking current directory: {} for store: {} in task: {} for deletion due to clean.on.container.start config.", currentDir, storeName, taskName);
storeDirsToDelete.put(storeName, currentDir);
storageManagerUtil.getTaskStoreCheckpointDirs(loggedStoreBaseDirectory, storeName, taskName, taskMode).forEach(checkpointDir -> {
LOG.info("Marking checkpoint directory: {} for store: {} in task: {} for deletion due to clean.on.container.start config.", checkpointDir, storeName, taskName);
storeDirsToDelete.put(storeName, checkpointDir);
});
LOG.info("Marking restore offsets for store: {} in task: {} to {}, {} ", storeName, taskName, oldestOffset, checkpointedOffset);
storesToRestore.put(storeName, new RestoreOffsets(oldestOffset, checkpointedOffset));
return;
}
Optional<File> currentDirOptional;
Optional<List<File>> checkpointDirsOptional;
if (!storageEngine.getStoreProperties().isPersistedToDisk()) {
currentDirOptional = Optional.empty();
checkpointDirsOptional = Optional.empty();
} else {
currentDirOptional = Optional.of(storageManagerUtil.getTaskStoreDir(loggedStoreBaseDirectory, storeName, taskName, taskMode));
checkpointDirsOptional = Optional.of(storageManagerUtil.getTaskStoreCheckpointDirs(loggedStoreBaseDirectory, storeName, taskName, taskMode));
}
LOG.info("For store: {} in task: {} got current dir: {}, checkpoint dirs: {}, checkpointed changelog offset: {}", storeName, taskName, currentDirOptional, checkpointDirsOptional, checkpointedOffset);
currentDirOptional.ifPresent(currentDir -> {
LOG.info("Marking current directory: {} for store: {} in task: {} for deletion.", currentDir, storeName, taskName);
storeDirsToDelete.put(storeName, currentDir);
});
if (checkpointedOffset == null && oldestOffset != null) {
// this can mean that either this is the initial migration for this feature and there are no previously
// checkpointed changelog offsets, or that this is a new store or changelog topic after the initial migration.
// if this is the first time migration, it might be desirable to retain existing data.
// if this is new store or topic, it is possible that the container previously died after writing some data to
// the changelog but before a commit, so it is desirable to delete the store, not restore anything and
// trim the changelog
// since we can't tell the difference b/w the two scenarios by just looking at the store and changelogs,
// we'll request users to indicate whether to retain existing data using a config flag. this flag should only
// be set during migrations, and turned off after the first successful commit of the new container (i.e. next
// deploy). for simplicity, we'll always delete the local store, and restore from changelog if necessary.
// the former scenario should not be common. the recommended way to opt-in to the transactional state feature
// is to first upgrade to the latest samza version but keep the transactional state restore config off.
// this will create the store checkpoint directories and write the changelog offset to the checkpoint, but
// will not use them during restore. once this is done (i.e. at least one commit after upgrade), the
// transactional state restore feature can be turned on on subsequent deploys. this code path exists as a
// fail-safe against clearing changelogs in case users do not follow upgrade instructions and enable the
// feature directly.
checkpointDirsOptional.ifPresent(checkpointDirs -> checkpointDirs.forEach(checkpointDir -> {
LOG.info("Marking checkpoint directory: {} for store: {} in task: {} for deletion since checkpointed " + "offset is null and oldest offset: {} is not.", checkpointDir, storeName, taskName, oldestOffset);
storeDirsToDelete.put(storeName, checkpointDir);
}));
if (new TaskConfig(config).getTransactionalStateRetainExistingState()) {
// mark for restore from (oldest, newest) to recreate local state.
LOG.warn("Checkpointed offset for store: {} in task: {} is null. Since retain existing state is true, " + "local state will be fully restored from current changelog contents. " + "There is no transactional local state guarantee.", storeName, taskName);
storesToRestore.put(storeName, new RestoreOffsets(oldestOffset, newestOffset));
} else {
LOG.warn("Checkpointed offset for store: {} in task: {} is null. Since retain existing state is false, " + "any local state and changelog topic contents will be deleted.", storeName, taskName);
// mark for restore from (oldest, null) to trim entire changelog.
storesToRestore.put(storeName, new RestoreOffsets(oldestOffset, null));
}
} else if (// check if the checkpointed offset is out of range of current oldest and newest offsets
admin.offsetComparator(oldestOffset, checkpointedOffset) > 0 || admin.offsetComparator(checkpointedOffset, newestOffset) > 0) {
// checkpointed offset is out of range. this could mean that this is a TTL topic and the checkpointed
// offset was TTLd, or that the changelog topic was manually deleted and then recreated.
// we cannot guarantee transactional state for TTL stores, so delete everything and do a full restore
// for local store. if the topic was deleted and recreated, this will have the side effect of
// clearing the store as well.
LOG.warn("Checkpointed offset: {} for store: {} in task: {} is out of range of oldest: {} or newest: {} offset." + "Deleting existing store and fully restoring from changelog topic from oldest to newest offset. If the topic " + "has time-based retention, there is no transactional local state guarantees. If the topic was changed," + "local state will be cleaned up and fully restored to match the new topic contents.", checkpointedOffset, storeName, taskName, oldestOffset, newestOffset);
checkpointDirsOptional.ifPresent(checkpointDirs -> checkpointDirs.forEach(checkpointDir -> storeDirsToDelete.put(storeName, checkpointDir)));
storesToRestore.put(storeName, new RestoreOffsets(oldestOffset, newestOffset));
} else {
// happy path. checkpointed offset is in range of current oldest and newest offsets
if (!checkpointDirsOptional.isPresent()) {
// non-persistent logged store
LOG.info("Did not find any checkpoint directories for logged (maybe non-persistent) store: {}. Local state " + "will be fully restored from current changelog contents.", storeName);
storesToRestore.put(storeName, new RestoreOffsets(oldestOffset, checkpointedOffset));
} else {
// persistent logged store
String targetOffset;
// check checkpoint time against min.compaction.lag.ms. if older, restore from checkpointed offset to newest
// with no trim. be conservative. allow 10% safety margin to avoid deletions when the downtime is close
// to min.compaction.lag.ms
long minCompactionLagMs = new StorageConfig(config).getChangelogMinCompactionLagMs(storeName);
if (timeSinceLastCheckpointInMs > .9 * minCompactionLagMs) {
LOG.warn("Checkpointed offset for store: {} in task: {} is: {}. It is in range of oldest: {} and " + "newest: {} changelog offset. However, time since last checkpoint is: {}, which is greater than " + "0.9 * min.compaction.lag.ms: {} for the changelog topic. Since there is a chance that" + "the changelog topic has been compacted, restoring store to the end of the current changelog contents." + "There is no transactional local state guarantee.", storeName, taskName, checkpointedOffset, oldestOffset, newestOffset, timeSinceLastCheckpointInMs, minCompactionLagMs);
targetOffset = newestOffset;
} else {
targetOffset = checkpointedOffset;
}
// if there exists a valid store checkpoint directory with oldest offset <= local offset <= target offset,
// retain it and restore the delta. delete all other checkpoint directories for the store. if more than one such
// checkpoint directory exists, retain the one with the highest local offset and delete the rest.
boolean hasValidCheckpointDir = false;
for (File checkpointDir : checkpointDirsOptional.get()) {
if (storageManagerUtil.isLoggedStoreValid(storeName, checkpointDir, config, storeChangelogs, taskModel, clock, storeEngines)) {
String localOffset = storageManagerUtil.readOffsetFile(checkpointDir, Collections.singleton(changelogSSP), false).get(changelogSSP);
LOG.info("Read local offset: {} for store: {} checkpoint dir: {} in task: {}", localOffset, storeName, checkpointDir, taskName);
if (admin.offsetComparator(localOffset, oldestOffset) >= 0 && admin.offsetComparator(localOffset, targetOffset) <= 0 && (storesToRestore.get(storeName) == null || admin.offsetComparator(localOffset, storesToRestore.get(storeName).startingOffset) > 0)) {
hasValidCheckpointDir = true;
LOG.info("Temporarily marking checkpoint dir: {} for store: {} in task: {} for retention. " + "May be overridden later.", checkpointDir, storeName, taskName);
storeDirToRetain.put(storeName, checkpointDir);
// mark for restore even if local == checkpointed, so that the changelog gets trimmed.
LOG.info("Temporarily marking store: {} in task: {} for restore from beginning offset: {} to " + "ending offset: {}. May be overridden later", storeName, taskName, localOffset, targetOffset);
storesToRestore.put(storeName, new RestoreOffsets(localOffset, targetOffset));
}
}
}
// delete all non-retained checkpoint directories
for (File checkpointDir : checkpointDirsOptional.get()) {
if (storeDirToRetain.get(storeName) == null || !storeDirToRetain.get(storeName).equals(checkpointDir)) {
LOG.info("Marking checkpoint directory: {} for store: {} in task: {} for deletion since it is not " + "marked for retention.", checkpointDir, storeName, taskName);
storeDirsToDelete.put(storeName, checkpointDir);
}
}
// if the store had not valid checkpoint dirs to retain, restore from changelog
if (!hasValidCheckpointDir) {
storesToRestore.put(storeName, new RestoreOffsets(oldestOffset, targetOffset));
}
}
}
});
LOG.info("Store directories to be retained in Task: {} are: {}", taskName, storeDirToRetain);
LOG.info("Store directories to be deleted in Task: {} are: {}", taskName, storeDirsToDelete);
LOG.info("Stores to be restored in Task: {} are: {}", taskName, storesToRestore);
return new StoreActions(storeDirToRetain, storeDirsToDelete, storesToRestore);
}
use of org.apache.samza.checkpoint.CheckpointId in project samza by apache.
the class BlobStoreRestoreManager method restoreStores.
/**
* Restores all eligible stores in the task.
*/
@VisibleForTesting
static CompletableFuture<Void> restoreStores(String jobName, String jobId, TaskName taskName, Set<String> storesToRestore, Map<String, Pair<String, SnapshotIndex>> prevStoreSnapshotIndexes, File loggedBaseDir, StorageConfig storageConfig, BlobStoreRestoreManagerMetrics metrics, StorageManagerUtil storageManagerUtil, BlobStoreUtil blobStoreUtil, DirDiffUtil dirDiffUtil, ExecutorService executor) {
long restoreStartTime = System.nanoTime();
List<CompletionStage<Void>> restoreFutures = new ArrayList<>();
LOG.debug("Starting restore for task: {} stores: {}", taskName, storesToRestore);
storesToRestore.forEach(storeName -> {
if (!prevStoreSnapshotIndexes.containsKey(storeName)) {
LOG.info("No checkpointed snapshot index found for task: {} store: {}. Skipping restore.", taskName, storeName);
// blob store based backup and restore, both at the same time.
return;
}
Pair<String, SnapshotIndex> scmAndSnapshotIndex = prevStoreSnapshotIndexes.get(storeName);
long storeRestoreStartTime = System.nanoTime();
SnapshotIndex snapshotIndex = scmAndSnapshotIndex.getRight();
DirIndex dirIndex = snapshotIndex.getDirIndex();
DirIndex.Stats stats = DirIndex.getStats(dirIndex);
metrics.filesToRestore.getValue().addAndGet(stats.filesPresent);
metrics.bytesToRestore.getValue().addAndGet(stats.bytesPresent);
metrics.filesRemaining.getValue().addAndGet(stats.filesPresent);
metrics.bytesRemaining.getValue().addAndGet(stats.bytesPresent);
CheckpointId checkpointId = snapshotIndex.getSnapshotMetadata().getCheckpointId();
File storeDir = storageManagerUtil.getTaskStoreDir(loggedBaseDir, storeName, taskName, TaskMode.Active);
Path storeCheckpointDir = Paths.get(storageManagerUtil.getStoreCheckpointDir(storeDir, checkpointId));
LOG.trace("Got task: {} store: {} local store directory: {} and local store checkpoint directory: {}", taskName, storeName, storeDir, storeCheckpointDir);
// we always delete the store dir to preserve transactional state guarantees.
try {
LOG.debug("Deleting local store directory: {}. Will be restored from local store checkpoint directory " + "or remote snapshot.", storeDir);
FileUtils.deleteDirectory(storeDir);
} catch (IOException e) {
throw new SamzaException(String.format("Error deleting store directory: %s", storeDir), e);
}
boolean shouldRestore = shouldRestore(taskName.getTaskName(), storeName, dirIndex, storeCheckpointDir, storageConfig, dirDiffUtil);
if (shouldRestore) {
// restore the store from the remote blob store
// delete all store checkpoint directories. if we only delete the store directory and don't
// delete the checkpoint directories, the store size on disk will grow to 2x after restore
// until the first commit is completed and older checkpoint dirs are deleted. This is
// because the hard-linked checkpoint dir files will no longer be de-duped with the
// now-deleted main store directory contents and will take up additional space of their
// own during the restore.
deleteCheckpointDirs(taskName, storeName, loggedBaseDir, storageManagerUtil);
metrics.storePreRestoreNs.get(storeName).set(System.nanoTime() - storeRestoreStartTime);
enqueueRestore(jobName, jobId, taskName.toString(), storeName, storeDir, dirIndex, storeRestoreStartTime, restoreFutures, blobStoreUtil, dirDiffUtil, metrics, executor);
} else {
LOG.debug("Renaming store checkpoint directory: {} to store directory: {} since its contents are identical " + "to the remote snapshot.", storeCheckpointDir, storeDir);
// atomically rename the checkpoint dir to the store dir
new FileUtil().move(storeCheckpointDir.toFile(), storeDir);
// delete any other checkpoint dirs.
deleteCheckpointDirs(taskName, storeName, loggedBaseDir, storageManagerUtil);
}
});
// wait for all restores to finish
return FutureUtil.allOf(restoreFutures).whenComplete((res, ex) -> {
LOG.info("Restore completed for task: {} stores", taskName);
metrics.restoreNs.set(System.nanoTime() - restoreStartTime);
});
}
use of org.apache.samza.checkpoint.CheckpointId in project samza by apache.
the class TestBlobStoreUtil method testCleanup.
@Test
public void testCleanup() throws IOException, ExecutionException, InterruptedException {
BlobStoreManager blobStoreManager = mock(BlobStoreManager.class);
// File, dir and recursive dir added, retained and removed in local
// Using unique file names since test util uses only the file name (leaf node)
// as the mock blob id, not the full file path.
String local = "[a, c, z/1, y/2, p/m/3, q/n/4]";
String remote = "[a, b, z/1, x/5, p/m/3, r/o/6]";
String expectedRemoved = "[b, 5, 6]";
// keep only the last character (the file name).
SortedSet<String> expectedRemovedFiles = BlobStoreTestUtil.getExpected(expectedRemoved);
// Set up environment
Path localSnapshotDir = BlobStoreTestUtil.createLocalDir(local);
String basePath = localSnapshotDir.toAbsolutePath().toString();
DirIndex remoteSnapshotDir = BlobStoreTestUtil.createDirIndex(remote);
SnapshotMetadata snapshotMetadata = new SnapshotMetadata(checkpointId, jobName, jobId, taskName, storeName);
DirDiff dirDiff = DirDiffUtil.getDirDiff(localSnapshotDir.toFile(), remoteSnapshotDir, (localFile, remoteFile) -> localFile.getName().equals(remoteFile.getFileName()));
BlobStoreUtil blobStoreUtil = new BlobStoreUtil(blobStoreManager, EXECUTOR, null, null);
when(blobStoreManager.put(any(InputStream.class), any(Metadata.class))).thenReturn(CompletableFuture.completedFuture("blobId"));
CompletionStage<DirIndex> dirIndexFuture = blobStoreUtil.putDir(dirDiff, snapshotMetadata);
DirIndex dirIndex = null;
try {
// should be already complete. if not, future composition in putDir is broken.
dirIndex = dirIndexFuture.toCompletableFuture().get(0, TimeUnit.MILLISECONDS);
} catch (TimeoutException e) {
fail("Future returned from putDir should be already complete.");
}
// Set up mocks
SortedSet<String> allDeleted = new TreeSet<>();
when(blobStoreManager.delete(anyString(), any(Metadata.class))).thenAnswer((Answer<CompletableFuture<Void>>) invocation -> {
String blobId = invocation.getArgumentAt(0, String.class);
allDeleted.add(blobId);
return CompletableFuture.completedFuture(null);
});
// Execute
CompletionStage<Void> cleanUpFuture = blobStoreUtil.cleanUpDir(dirIndex, metadata);
try {
// should be already complete. if not, future composition in putDir is broken.
cleanUpFuture.toCompletableFuture().get(0, TimeUnit.MILLISECONDS);
} catch (TimeoutException e) {
fail("Future returned from putDir should be already complete.");
}
// Assert
assertEquals(expectedRemovedFiles, allDeleted);
}
Aggregations