use of org.apache.samza.checkpoint.CheckpointManager in project samza by apache.
the class ContainerStorageManager method restoreStores.
/**
 * Restores all non-side-input stores, in parallel across tasks (and across restore backend factories within a
 * task), and then builds the final {@code taskStores} map.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>For every task in the container model, read the last checkpoint (active tasks only, and only when a
 *       checkpoint manager is present) and create that task's restore managers, keyed by factory name.</li>
 *   <li>Initialize every restore manager with its task's checkpoint (possibly {@code null}).</li>
 *   <li>Start each distinct store consumer, kick off every restore, and block until all of them finish.</li>
 *   <li>Stop each distinct store consumer, create the task stores, and merge in the in-memory and side-input
 *       stores.</li>
 * </ol>
 *
 * @throws InterruptedException if the calling thread is interrupted while waiting for a restore to finish; the
 *     restore executor is shut down before the exception is rethrown
 * @throws SamzaException if any restore fails
 */
private void restoreStores() throws InterruptedException {
    LOG.info("Store Restore started");
    Set<TaskName> activeTasks = getTasks(containerModel, TaskMode.Active).keySet();
    // TODO HIGH dchen verify davinci lifecycle
    // Find all non-side input stores
    Set<String> nonSideInputStoreNames = storageEngineFactories.keySet().stream()
        .filter(storeName -> !sideInputStoreNames.contains(storeName))
        .collect(Collectors.toSet());
    // The storage config is the same for every task, so build it once instead of once per task.
    StorageConfig storageConfig = new StorageConfig(config);

    // Obtain the checkpoint and create the restore managers for each task
    Map<TaskName, Map<String, TaskRestoreManager>> taskRestoreManagers = new HashMap<>();
    Map<TaskName, Checkpoint> taskCheckpoints = new HashMap<>();
    containerModel.getTasks().forEach((taskName, taskModel) -> {
        Checkpoint taskCheckpoint = null;
        if (checkpointManager != null && activeTasks.contains(taskName)) {
            // only pass in checkpoints for active tasks
            taskCheckpoint = checkpointManager.readLastCheckpoint(taskName);
            LOG.info("Obtained checkpoint: {} for state restore for taskName: {}", taskCheckpoint, taskName);
        }
        // Tasks without a checkpoint are recorded with null, so init() below receives null for them.
        taskCheckpoints.put(taskName, taskCheckpoint);
        Map<String, Set<String>> backendFactoryStoreNames =
            getBackendFactoryStoreNames(taskCheckpoint, nonSideInputStoreNames, storageConfig);
        Map<String, TaskRestoreManager> taskStoreRestoreManagers = createTaskRestoreManagers(
            restoreStateBackendFactories, backendFactoryStoreNames, clock, samzaContainerMetrics, taskName, taskModel);
        taskRestoreManagers.put(taskName, taskStoreRestoreManagers);
    });

    // Initialize each TaskRestoreManager with the checkpoint of the task it belongs to
    taskRestoreManagers.forEach((taskName, restoreManagers) ->
        restoreManagers.forEach((factoryName, taskRestoreManager) ->
            taskRestoreManager.init(taskCheckpoints.get(taskName))));

    // Start each store consumer once.
    // Note: These consumers are per system and only changelog system store consumers will be started.
    // Some TaskRestoreManagers may not require the consumer to be started, but due to the agnostic nature of
    // ContainerStorageManager we always start the changelog consumer here in case it is required
    this.storeConsumers.values().stream().distinct().forEach(SystemConsumer::start);

    List<Future<Void>> taskRestoreFutures = new ArrayList<>();
    // Submit restore callable for each taskInstance
    taskRestoreManagers.forEach((taskInstance, restoreManagersMap) -> {
        // Submit for each restore factory
        restoreManagersMap.forEach((factoryName, taskRestoreManager) -> {
            long startTime = System.currentTimeMillis();
            String taskName = taskInstance.getTaskName();
            LOG.info("Starting restore for state for task: {}", taskName);
            CompletableFuture<Void> restoreFuture = taskRestoreManager.restore().handle((res, ex) -> {
                // Close the restore manager as soon as its restore finishes, whether it succeeded or failed.
                // NOTE(review): the original comment here was truncated; it referred to stores being compacted
                // on stop, so closing inside each future presumably parallelizes that work -- confirm.
                try {
                    taskRestoreManager.close();
                } catch (Exception e) {
                    LOG.error("Error closing restore manager for task: {} after {} restore", taskName, ex != null ? "unsuccessful" : "successful", e);
                    // ignore exception from close. container may still be able to continue processing/backups
                    // if restore manager close fails.
                }
                // Record the restore duration (including the close above) in the task's gauge, if one exists.
                long timeToRestore = System.currentTimeMillis() - startTime;
                if (samzaContainerMetrics != null) {
                    Gauge taskGauge = samzaContainerMetrics.taskStoreRestorationMetrics().getOrDefault(taskInstance, null);
                    if (taskGauge != null) {
                        taskGauge.set(timeToRestore);
                    }
                }
                if (ex != null) {
                    // log and rethrow exception to communicate restore failure
                    String msg = String.format("Error restoring state for task: %s", taskName);
                    LOG.error(msg, ex);
                    // wrap in unchecked exception to throw from lambda
                    throw new SamzaException(msg, ex);
                }
                return null;
            });
            taskRestoreFutures.add(restoreFuture);
        });
    });

    // Wait for every restore to finish; any failure during restore is rethrown as a SamzaException.
    // NOTE(review): when a restore fails or the wait is interrupted, the store consumers started above are
    // not stopped by this method -- confirm that the caller's shutdown path takes care of them.
    for (Future<Void> future : taskRestoreFutures) {
        try {
            future.get();
        } catch (InterruptedException e) {
            LOG.warn("Received an interrupt during store restoration. Interrupting the restore executor to exit " + "prematurely without restoring full state.");
            restoreExecutor.shutdownNow();
            throw e;
        } catch (Exception e) {
            LOG.error("Exception when restoring state.", e);
            throw new SamzaException("Exception when restoring state.", e);
        }
    }

    // Stop each store consumer once
    this.storeConsumers.values().stream().distinct().forEach(SystemConsumer::stop);

    // Now create persistent non side input stores in read-write mode, leave non-persistent stores as-is
    this.taskStores = createTaskStores(nonSideInputStoreNames, this.containerModel, jobContext, containerContext, storageEngineFactories, serdes, taskInstanceMetrics, taskInstanceCollectors);

    // Add in memory stores
    this.inMemoryStores.forEach((taskName, stores) ->
        this.taskStores.computeIfAbsent(taskName, key -> new HashMap<>()).putAll(stores));

    // Add side input stores
    this.sideInputStores.forEach((taskName, stores) ->
        this.taskStores.computeIfAbsent(taskName, key -> new HashMap<>()).putAll(stores));

    LOG.info("Store Restore complete");
}
use of org.apache.samza.checkpoint.CheckpointManager in project samza by apache.
the class TestTaskStorageCommitManager method testRemoveOldCheckpointsWhenBaseDirContainsRegularFiles.
/**
 * Checks that cleanUp completes when the durable store base directory contains an entry whose
 * {@code listFiles(FileFilter)} returns null, as happens for a regular file rather than a directory.
 */
@Test
public void testRemoveOldCheckpointsWhenBaseDirContainsRegularFiles() {
    // Arrange: collaborators
    TaskName task = new TaskName("task1");
    String regularFileName = "notDirectory";
    File baseDir = mock(File.class);
    File regularFile = mock(File.class);
    StorageManagerUtil smUtil = mock(StorageManagerUtil.class);
    CheckpointManager checkpoints = mock(CheckpointManager.class);
    ContainerStorageManager csm = mock(ContainerStorageManager.class);
    TaskBackupManager firstBackupManager = mock(TaskBackupManager.class);
    TaskBackupManager secondBackupManager = mock(TaskBackupManager.class);
    TaskInstanceMetrics taskMetrics = mock(TaskInstanceMetrics.class);
    Timer timer = mock(Timer.class);

    // Arrange: stubbing
    when(taskMetrics.storeCheckpointNs()).thenReturn(timer);
    when(csm.getAllStores(task)).thenReturn(Collections.emptyMap());
    when(baseDir.listFiles()).thenReturn(new File[] { regularFile });
    when(regularFile.getName()).thenReturn(regularFileName);
    when(smUtil.getTaskStoreDir(eq(baseDir), eq(regularFileName), eq(task), eq(TaskMode.Active)))
        .thenReturn(regularFile);
    // null here can happen if listFiles is called on a non-directory
    when(regularFile.listFiles(any(FileFilter.class))).thenReturn(null);

    Map<String, TaskBackupManager> managersByFactory =
        ImmutableMap.of("factory1", firstBackupManager, "factory2", secondBackupManager);
    TaskStorageCommitManager commitManager = new TaskStorageCommitManager(
        task, managersByFactory, csm, Collections.emptyMap(), new Partition(1), checkpoints,
        new MapConfig(), ForkJoinPool.commonPool(), smUtil, baseDir, taskMetrics);

    // Act: join so that a failure inside the asynchronous cleanup surfaces in this test
    commitManager.cleanUp(CheckpointId.create(), new HashMap<>()).join();

    // Assert: the base directory was scanned and the regular file was looked up and listed
    verify(baseDir).listFiles();
    verify(smUtil).getTaskStoreDir(eq(baseDir), eq(regularFileName), eq(task), eq(TaskMode.Active));
    verify(regularFile).listFiles(any(FileFilter.class));
}
use of org.apache.samza.checkpoint.CheckpointManager in project samza by apache.
the class TestTaskStorageCommitManager method testSnapshotAndCommitAllFactories.
/**
 * Checks that init, snapshot and upload on the commit manager are each forwarded to every configured
 * backup manager, and that the store checkpoint timer is updated.
 */
@Test
public void testSnapshotAndCommitAllFactories() {
    // Arrange: collaborators
    TaskName task = new TaskName("task1");
    Checkpoint lastCheckpoint = mock(Checkpoint.class);
    CheckpointManager checkpoints = mock(CheckpointManager.class);
    ContainerStorageManager csm = mock(ContainerStorageManager.class);
    TaskBackupManager firstBackupManager = mock(TaskBackupManager.class);
    TaskBackupManager secondBackupManager = mock(TaskBackupManager.class);
    TaskInstanceMetrics taskMetrics = mock(TaskInstanceMetrics.class);
    Timer timer = mock(Timer.class);
    when(taskMetrics.storeCheckpointNs()).thenReturn(timer);

    Map<String, TaskBackupManager> managersByFactory =
        ImmutableMap.of("factory1", firstBackupManager, "factory2", secondBackupManager);
    TaskStorageCommitManager commitManager = new TaskStorageCommitManager(
        task, managersByFactory, csm, Collections.emptyMap(), new Partition(1), checkpoints,
        new MapConfig(), ForkJoinPool.commonPool(), new StorageManagerUtil(), null, taskMetrics);

    // init() must hand the last checkpoint to every backup manager
    when(checkpoints.readLastCheckpoint(task)).thenReturn(lastCheckpoint);
    commitManager.init();
    verify(firstBackupManager).init(eq(lastCheckpoint));
    verify(secondBackupManager).init(eq(lastCheckpoint));

    // Arrange: per-factory snapshot results and upload futures
    CheckpointId checkpointId = CheckpointId.create();
    Map<String, String> firstFactoryMarkers =
        ImmutableMap.of("store1", "system;stream;1", "store2", "system;stream;2");
    Map<String, String> secondFactoryMarkers = ImmutableMap.of("store1", "blobId1", "store2", "blobId2");
    when(csm.getAllStores(task)).thenReturn(Collections.emptyMap());
    when(firstBackupManager.snapshot(checkpointId)).thenReturn(firstFactoryMarkers);
    when(secondBackupManager.snapshot(checkpointId)).thenReturn(secondFactoryMarkers);
    when(firstBackupManager.upload(checkpointId, firstFactoryMarkers))
        .thenReturn(CompletableFuture.completedFuture(firstFactoryMarkers));
    when(secondBackupManager.upload(checkpointId, secondFactoryMarkers))
        .thenReturn(CompletableFuture.completedFuture(secondFactoryMarkers));

    // Act
    Map<String, Map<String, String>> snapshotMarkers = commitManager.snapshot(checkpointId);
    commitManager.upload(checkpointId, snapshotMarkers);

    // Test flow for snapshot
    verify(firstBackupManager).snapshot(checkpointId);
    verify(secondBackupManager).snapshot(checkpointId);
    // Test flow for upload
    verify(firstBackupManager).upload(checkpointId, firstFactoryMarkers);
    verify(secondBackupManager).upload(checkpointId, secondFactoryMarkers);
    verify(timer).update(anyLong());
}
use of org.apache.samza.checkpoint.CheckpointManager in project samza by apache.
the class TestTaskStorageCommitManager method testCleanupAllBackupManagers.
/**
 * Checks that cleanUp on the commit manager is forwarded to every configured backup manager with that
 * factory's own checkpoint markers.
 */
@Test
public void testCleanupAllBackupManagers() {
    CheckpointManager checkpointManager = mock(CheckpointManager.class);
    TaskBackupManager taskBackupManager1 = mock(TaskBackupManager.class);
    TaskBackupManager taskBackupManager2 = mock(TaskBackupManager.class);
    ContainerStorageManager containerStorageManager = mock(ContainerStorageManager.class);
    Checkpoint checkpoint = mock(Checkpoint.class);
    // An empty base directory: there are no old checkpoint directories for cleanUp to look at.
    File durableStoreDir = mock(File.class);
    when(durableStoreDir.listFiles()).thenReturn(new File[0]);
    TaskInstanceMetrics metrics = mock(TaskInstanceMetrics.class);
    Timer checkpointTimer = mock(Timer.class);
    when(metrics.storeCheckpointNs()).thenReturn(checkpointTimer);
    TaskName taskName = new TaskName("task1");
    Map<String, TaskBackupManager> backupManagers = ImmutableMap.of("factory1", taskBackupManager1, "factory2", taskBackupManager2);
    TaskStorageCommitManager cm = new TaskStorageCommitManager(taskName, backupManagers, containerStorageManager, Collections.emptyMap(), new Partition(1), checkpointManager, new MapConfig(), ForkJoinPool.commonPool(), new StorageManagerUtil(), durableStoreDir, metrics);
    when(checkpointManager.readLastCheckpoint(taskName)).thenReturn(checkpoint);
    when(containerStorageManager.getAllStores(taskName)).thenReturn(Collections.emptyMap());
    // Stub cleanUp once per backup manager (the original registered the identical stubs twice).
    when(taskBackupManager1.cleanUp(any(), any())).thenReturn(CompletableFuture.completedFuture(null));
    when(taskBackupManager2.cleanUp(any(), any())).thenReturn(CompletableFuture.completedFuture(null));
    Map<String, String> factory1Checkpoints = ImmutableMap.of("store1", "system;stream;1", "store2", "system;stream;2");
    Map<String, String> factory2Checkpoints = ImmutableMap.of("store1", "blobId1", "store2", "blobId2");
    Map<String, Map<String, String>> factoryCheckpointsMap = ImmutableMap.of("factory1", factory1Checkpoints, "factory2", factory2Checkpoints);
    CheckpointId newCheckpointId = CheckpointId.create();
    // join() so that a failure inside the returned future fails this test
    cm.cleanUp(newCheckpointId, factoryCheckpointsMap).join();
    // Each backup manager must be called with its own factory's markers
    verify(taskBackupManager1).cleanUp(newCheckpointId, factory1Checkpoints);
    verify(taskBackupManager2).cleanUp(newCheckpointId, factory2Checkpoints);
}
use of org.apache.samza.checkpoint.CheckpointManager in project samza by apache.
the class TestTaskStorageCommitManager method testCleanupFailsIfBackupManagerNotInitiated.
/**
 * Checks that cleanUp does not fail when the checkpoint markers name a state backend factory for which the
 * commit manager has no backup manager.
 */
@Test
public void testCleanupFailsIfBackupManagerNotInitiated() {
    CheckpointManager checkpointManager = mock(CheckpointManager.class);
    ContainerStorageManager containerStorageManager = mock(ContainerStorageManager.class);
    Checkpoint checkpoint = mock(Checkpoint.class);
    // An empty base directory: there are no old checkpoint directories for cleanUp to look at.
    File durableStoreDir = mock(File.class);
    when(durableStoreDir.listFiles()).thenReturn(new File[0]);
    TaskInstanceMetrics metrics = mock(TaskInstanceMetrics.class);
    Timer checkpointTimer = mock(Timer.class);
    when(metrics.storeCheckpointNs()).thenReturn(checkpointTimer);
    TaskName taskName = new TaskName("task1");
    when(containerStorageManager.getAllStores(taskName)).thenReturn(Collections.emptyMap());
    // The commit manager is built with no backup managers at all
    TaskStorageCommitManager cm = new TaskStorageCommitManager(taskName, Collections.emptyMap(), containerStorageManager, Collections.emptyMap(), new Partition(1), checkpointManager, new MapConfig(), ForkJoinPool.commonPool(), new StorageManagerUtil(), durableStoreDir, metrics);
    when(checkpointManager.readLastCheckpoint(taskName)).thenReturn(checkpoint);
    // factory 3 should be ignored
    Map<String, Map<String, String>> factoryCheckpointsMap = ImmutableMap.of("factory3", Collections.emptyMap());
    CheckpointId newCheckpointId = CheckpointId.create();
    // should not fail the commit because the job should ignore any factories checkpoints not initialized
    // in case the user is in a migration phase from one state backend to another.
    // join() is required for this to be checked: without it, an exceptionally completed future would go
    // unnoticed and the test would pass regardless.
    cm.cleanUp(newCheckpointId, factoryCheckpointsMap).join();
}
Aggregations