Use of org.apache.samza.checkpoint.Checkpoint in project samza by apache.
The class BlobStoreUtil, method getStoreSnapshotIndexes.
/**
 * Get the snapshot index blob id and the corresponding {@link SnapshotIndex} for each store of the provided
 * {@code task} in the provided {@code checkpoint}.
 * @param jobName job name, used to build request metadata
 * @param jobId job id, used to build request metadata
 * @param taskName task name to get the store state checkpoint markers and snapshot indexes for
 * @param checkpoint {@link Checkpoint} instance to get the store state checkpoint markers from. Only
 *                   {@link CheckpointV2} and newer are supported for blob stores.
 * @param storesToBackupOrRestore set of store names to be backed up or restored
 * @return Map of store name to a pair of its snapshot index blob id and the corresponding {@link SnapshotIndex}
 *         for the task.
 */
public Map<String, Pair<String, SnapshotIndex>> getStoreSnapshotIndexes(String jobName, String jobId, String taskName,
    Checkpoint checkpoint, Set<String> storesToBackupOrRestore) {
  // TODO MED shesharma document error handling (checkpoint ver, blob not found, getBlob)
  if (checkpoint == null) {
    LOG.debug("No previous checkpoint found for taskName: {}", taskName);
    return ImmutableMap.of();
  }

  if (checkpoint.getVersion() == 1) {
    LOG.warn("Checkpoint version 1 is not supported for blob store backup and restore.");
    return ImmutableMap.of();
  }

  Map<String, CompletableFuture<Pair<String, SnapshotIndex>>> storeSnapshotIndexFutures = new HashMap<>();

  CheckpointV2 checkpointV2 = (CheckpointV2) checkpoint;
  Map<String, Map<String, String>> factoryToStoreSCMs = checkpointV2.getStateCheckpointMarkers();
  Map<String, String> storeSnapshotIndexBlobIds = factoryToStoreSCMs.get(BlobStoreStateBackendFactory.class.getName());

  if (storeSnapshotIndexBlobIds != null) {
    storeSnapshotIndexBlobIds.forEach((storeName, snapshotIndexBlobId) -> {
      if (storesToBackupOrRestore.contains(storeName)) {
        try {
          LOG.debug("Getting snapshot index for taskName: {} store: {} blobId: {}", taskName, storeName, snapshotIndexBlobId);
          Metadata requestMetadata =
              new Metadata(Metadata.SNAPSHOT_INDEX_PAYLOAD_PATH, Optional.empty(), jobName, jobId, taskName, storeName);
          CompletableFuture<SnapshotIndex> snapshotIndexFuture =
              getSnapshotIndex(snapshotIndexBlobId, requestMetadata).toCompletableFuture();
          Pair<CompletableFuture<String>, CompletableFuture<SnapshotIndex>> pairOfFutures =
              Pair.of(CompletableFuture.completedFuture(snapshotIndexBlobId), snapshotIndexFuture);

          // save the future and block once at the end instead of blocking for each request.
          storeSnapshotIndexFutures.put(storeName, FutureUtil.toFutureOfPair(pairOfFutures));
        } catch (Exception e) {
          throw new SamzaException(
              String.format("Error getting SnapshotIndex for blobId: %s for taskName: %s store: %s",
                  snapshotIndexBlobId, taskName, storeName), e);
        }
      } else {
        LOG.debug("SnapshotIndex blob id {} for store {} is not present in the set of stores to be backed up/restored: {}",
            snapshotIndexBlobId, storeName, storesToBackupOrRestore);
      }
    });
  } else {
    LOG.debug("No store SCMs found for blob store state backend for taskName: {} in checkpoint {}",
        taskName, checkpointV2.getCheckpointId());
  }

  try {
    return FutureUtil.toFutureOfMap(t -> {
      Throwable unwrappedException = FutureUtil.unwrapExceptions(CompletionException.class, t);
      if (unwrappedException instanceof DeletedException) {
        LOG.warn("Ignoring already deleted snapshot index for taskName: {}", taskName, t);
        return true;
      } else {
        return false;
      }
    }, storeSnapshotIndexFutures).join();
  } catch (Exception e) {
    throw new SamzaException(
        String.format("Error while waiting to get store snapshot indexes for task %s", taskName), e);
  }
}
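For orientation, here is a brief usage sketch (not from the Samza source) of how a caller might consume the map returned by getStoreSnapshotIndexes. The blobStoreUtil and checkpoint variables and the job/task/store names are illustrative assumptions; the getKey/getValue accessors on the returned Pair are the same ones used in the TestBlobStoreUtil examples further down.

// Illustrative sketch only: assumes an already-constructed BlobStoreUtil (blobStoreUtil)
// and a CheckpointV2 (checkpoint) read from the checkpoint manager.
Map<String, Pair<String, SnapshotIndex>> indexes =
    blobStoreUtil.getStoreSnapshotIndexes("myJob", "1", "Partition 0", checkpoint, ImmutableSet.of("myStore"));
indexes.forEach((storeName, blobIdAndIndex) -> {
  String snapshotIndexBlobId = blobIdAndIndex.getKey();    // blob id the SnapshotIndex was fetched from
  SnapshotIndex snapshotIndex = blobIdAndIndex.getValue(); // deserialized snapshot index for the store
  // e.g. compare the index contents against local store state to decide what needs to be restored
});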
Use of org.apache.samza.checkpoint.Checkpoint in project samza by apache.
The class ContainerStorageManager, method restoreStores.
// Restoration of all stores, in parallel across tasks
private void restoreStores() throws InterruptedException {
  LOG.info("Store Restore started");
  Set<TaskName> activeTasks = getTasks(containerModel, TaskMode.Active).keySet();
  // TODO HIGH dchen verify davinci lifecycle

  // Find all non-side input stores
  Set<String> nonSideInputStoreNames = storageEngineFactories.keySet().stream()
      .filter(storeName -> !sideInputStoreNames.contains(storeName))
      .collect(Collectors.toSet());

  // Obtain the checkpoints for each task
  Map<TaskName, Map<String, TaskRestoreManager>> taskRestoreManagers = new HashMap<>();
  Map<TaskName, Checkpoint> taskCheckpoints = new HashMap<>();
  containerModel.getTasks().forEach((taskName, taskModel) -> {
    Checkpoint taskCheckpoint = null;
    if (checkpointManager != null && activeTasks.contains(taskName)) {
      // only pass in checkpoints for active tasks
      taskCheckpoint = checkpointManager.readLastCheckpoint(taskName);
      LOG.info("Obtained checkpoint: {} for state restore for taskName: {}", taskCheckpoint, taskName);
    }
    taskCheckpoints.put(taskName, taskCheckpoint);

    Map<String, Set<String>> backendFactoryStoreNames =
        getBackendFactoryStoreNames(taskCheckpoint, nonSideInputStoreNames, new StorageConfig(config));
    Map<String, TaskRestoreManager> taskStoreRestoreManagers =
        createTaskRestoreManagers(restoreStateBackendFactories, backendFactoryStoreNames, clock,
            samzaContainerMetrics, taskName, taskModel);
    taskRestoreManagers.put(taskName, taskStoreRestoreManagers);
  });

  // Initialize each TaskStorageManager
  taskRestoreManagers.forEach((taskName, restoreManagers) ->
      restoreManagers.forEach((factoryName, taskRestoreManager) ->
          taskRestoreManager.init(taskCheckpoints.get(taskName))));

  // Start each store consumer once.
  // Note: These consumers are per system and only changelog system store consumers will be started.
  // Some TaskRestoreManagers may not require the consumer to be started, but due to the agnostic nature of
  // ContainerStorageManager we always start the changelog consumer here in case it is required.
  this.storeConsumers.values().stream().distinct().forEach(SystemConsumer::start);

  List<Future<Void>> taskRestoreFutures = new ArrayList<>();

  // Submit restore callable for each taskInstance
  taskRestoreManagers.forEach((taskInstance, restoreManagersMap) -> {
    // Submit for each restore factory
    restoreManagersMap.forEach((factoryName, taskRestoreManager) -> {
      long startTime = System.currentTimeMillis();
      String taskName = taskInstance.getTaskName();
      LOG.info("Starting restore for state for task: {}", taskName);
      CompletableFuture<Void> restoreFuture = taskRestoreManager.restore().handle((res, ex) -> {
        // Close the restore manager regardless of restore outcome. Some persistent stores are compacted
        // on stop, so parallelizing stop() also parallelizes their compaction (a time-intensive operation).
        try {
          taskRestoreManager.close();
        } catch (Exception e) {
          LOG.error("Error closing restore manager for task: {} after {} restore",
              taskName, ex != null ? "unsuccessful" : "successful", e);
          // ignore exception from close. container may still be able to continue processing/backups
          // if restore manager close fails.
        }
        long timeToRestore = System.currentTimeMillis() - startTime;

        if (samzaContainerMetrics != null) {
          Gauge taskGauge = samzaContainerMetrics.taskStoreRestorationMetrics().getOrDefault(taskInstance, null);
          if (taskGauge != null) {
            taskGauge.set(timeToRestore);
          }
        }

        if (ex != null) {
          // log and rethrow exception to communicate restore failure
          String msg = String.format("Error restoring state for task: %s", taskName);
          LOG.error(msg, ex);
          // wrap in unchecked exception to throw from lambda
          throw new SamzaException(msg, ex);
        } else {
          return null;
        }
      });
      taskRestoreFutures.add(restoreFuture);
    });
  });

  // Wait for each restore future to complete; catch any exceptions during restore and rethrow them
  // as samza exceptions
  for (Future<Void> future : taskRestoreFutures) {
    try {
      future.get();
    } catch (InterruptedException e) {
      LOG.warn("Received an interrupt during store restoration. Interrupting the restore executor to exit "
          + "prematurely without restoring full state.");
      restoreExecutor.shutdownNow();
      throw e;
    } catch (Exception e) {
      LOG.error("Exception when restoring state.", e);
      throw new SamzaException("Exception when restoring state.", e);
    }
  }

  // Stop each store consumer once
  this.storeConsumers.values().stream().distinct().forEach(SystemConsumer::stop);

  // Now create persistent non side input stores in read-write mode, leave non-persistent stores as-is
  this.taskStores = createTaskStores(nonSideInputStoreNames, this.containerModel, jobContext, containerContext,
      storageEngineFactories, serdes, taskInstanceMetrics, taskInstanceCollectors);

  // Add in memory stores
  this.inMemoryStores.forEach((taskName, stores) -> {
    if (!this.taskStores.containsKey(taskName)) {
      taskStores.put(taskName, new HashMap<>());
    }
    taskStores.get(taskName).putAll(stores);
  });

  // Add side input stores
  this.sideInputStores.forEach((taskName, stores) -> {
    if (!this.taskStores.containsKey(taskName)) {
      taskStores.put(taskName, new HashMap<>());
    }
    taskStores.get(taskName).putAll(stores);
  });

  LOG.info("Store Restore complete");
}
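To isolate the concurrency pattern used above, the following is a minimal, self-contained sketch (plain Java, not Samza code; the Restorer interface and all names are illustrative assumptions) of submitting restores in parallel, closing each manager in handle() whether or not the restore succeeded, and surfacing failures once when joining at the end.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;

public class ParallelRestoreSketch {
  // "Restorer" is an illustrative stand-in for TaskRestoreManager-like objects.
  public interface Restorer {
    CompletableFuture<Void> restore();
    void close();
  }

  public static void restoreAll(List<Restorer> restorers) {
    List<CompletableFuture<Void>> futures = new ArrayList<>();
    for (Restorer restorer : restorers) {
      futures.add(restorer.restore().handle((res, ex) -> {
        try {
          restorer.close();  // always close, even after a failed restore
        } catch (Exception e) {
          // ignore close failures; the restore outcome decides success
        }
        if (ex != null) {
          // rethrow so the joined future (and the caller) sees the failure
          throw new RuntimeException("Error restoring state", ex);
        }
        return null;
      }));
    }
    // block once at the end instead of blocking per restore
    CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
  }
}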
Use of org.apache.samza.checkpoint.Checkpoint in project samza by apache.
The class ITestAzureCheckpointManager, method testStoringAndReadingCheckpointsMultiTasks.
@Test
public void testStoringAndReadingCheckpointsMultiTasks() {
  Partition partition = new Partition(0);
  Partition partition1 = new Partition(1);
  TaskName taskName = new TaskName("taskName1");
  TaskName taskName1 = new TaskName("taskName2");
  SystemStreamPartition ssp = new SystemStreamPartition("Azure", "Stream", partition);
  SystemStreamPartition ssp1 = new SystemStreamPartition("Azure", "Stream", partition1);

  Map<SystemStreamPartition, String> sspMap = new HashMap<>();
  sspMap.put(ssp, "12345");
  sspMap.put(ssp1, "54321");
  Checkpoint cp1 = new CheckpointV1(sspMap);

  Map<SystemStreamPartition, String> sspMap2 = new HashMap<>();
  sspMap2.put(ssp, "12347");
  sspMap2.put(ssp1, "54323");
  Checkpoint cp2 = new CheckpointV1(sspMap2);

  checkpointManager.register(taskName);
  checkpointManager.register(taskName1);

  checkpointManager.writeCheckpoint(taskName, cp1);
  checkpointManager.writeCheckpoint(taskName1, cp2);

  Checkpoint readCp1 = checkpointManager.readLastCheckpoint(taskName);
  Assert.assertNotNull(readCp1);
  Assert.assertEquals(cp1, readCp1);

  Checkpoint readCp2 = checkpointManager.readLastCheckpoint(taskName1);
  Assert.assertNotNull(readCp2);
  Assert.assertEquals(cp2, readCp2);

  checkpointManager.writeCheckpoint(taskName, cp2);
  checkpointManager.writeCheckpoint(taskName1, cp1);

  readCp1 = checkpointManager.readLastCheckpoint(taskName1);
  Assert.assertEquals(cp1, readCp1);

  readCp2 = checkpointManager.readLastCheckpoint(taskName);
  Assert.assertEquals(cp2, readCp2);
}
Use of org.apache.samza.checkpoint.Checkpoint in project samza by apache.
The class TestBlobStoreUtil, method testGetSSIThrowsExceptionIfAnyNonIgnoredAsyncBlobStoreErrors.
@Test
public void testGetSSIThrowsExceptionIfAnyNonIgnoredAsyncBlobStoreErrors() {
  String store = "storeName1";
  String otherStore = "storeName2";
  Set<String> storesToBackupOrRestore = new HashSet<>();
  storesToBackupOrRestore.add(store);
  storesToBackupOrRestore.add(otherStore);
  Checkpoint checkpoint = createCheckpointV2(BlobStoreStateBackendFactory.class.getName(),
      ImmutableMap.of(store, "snapshotIndexBlobId1", otherStore, "snapshotIndexBlobId2"));
  SnapshotIndex store1SnapshotIndex = mock(SnapshotIndex.class);

  BlobStoreUtil mockBlobStoreUtil = mock(BlobStoreUtil.class);
  when(mockBlobStoreUtil.getStoreSnapshotIndexes(anyString(), anyString(), anyString(), any(Checkpoint.class),
      anySetOf(String.class))).thenCallRealMethod();

  RuntimeException nonIgnoredException = new RuntimeException();
  CompletableFuture<SnapshotIndex> failedFuture = FutureUtil.failedFuture(nonIgnoredException);
  // should fail even if some errors are ignored
  when(mockBlobStoreUtil.getSnapshotIndex(eq("snapshotIndexBlobId1"), any(Metadata.class)))
      .thenReturn(FutureUtil.failedFuture(new DeletedException()));
  when(mockBlobStoreUtil.getSnapshotIndex(eq("snapshotIndexBlobId2"), any(Metadata.class)))
      .thenReturn(failedFuture);

  try {
    mockBlobStoreUtil.getStoreSnapshotIndexes("testJobName", "testJobId", "taskName", checkpoint, storesToBackupOrRestore);
    fail("Should have thrown an exception");
  } catch (Exception e) {
    Throwable cause =
        FutureUtil.unwrapExceptions(CompletionException.class, FutureUtil.unwrapExceptions(SamzaException.class, e));
    assertEquals(nonIgnoredException, cause);
  }
}
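The createCheckpointV2 helper used by this test (and the next one) is not shown on this page. As a hedged guess at its shape, assuming CheckpointV2's constructor takes a checkpoint id, a map of input offsets, and the per-factory state checkpoint markers seen earlier via getStateCheckpointMarkers(), it might look roughly like the sketch below; treat the details as assumptions rather than the actual test code.

// Hedged sketch of a possible createCheckpointV2 helper; not the real TestBlobStoreUtil code.
private CheckpointV2 createCheckpointV2(String stateBackendFactory, Map<String, String> storeSnapshotIndexBlobIds) {
  Map<String, Map<String, String>> factoryToStoreSCMs = new HashMap<>();
  factoryToStoreSCMs.put(stateBackendFactory, storeSnapshotIndexBlobIds);
  // empty input offsets; only the state checkpoint markers matter for these tests
  return new CheckpointV2(CheckpointId.create(), new HashMap<>(), factoryToStoreSCMs);
}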
Use of org.apache.samza.checkpoint.Checkpoint in project samza by apache.
The class TestBlobStoreUtil, method testGetSSIReturnsCorrectSCMSnapshotIndexPair.
@Test
public void testGetSSIReturnsCorrectSCMSnapshotIndexPair() {
  String storeName = "storeName";
  String otherStoreName = "otherStoreName";
  Set<String> storesToBackupOrRestore = ImmutableSet.of(storeName, otherStoreName);
  String storeSnapshotIndexBlobId = "snapshotIndexBlobId";
  String otherStoreSnapshotIndexBlobId = "otherSnapshotIndexBlobId";
  SnapshotIndex mockStoreSnapshotIndex = mock(SnapshotIndex.class);
  SnapshotIndex mockOtherStoreSnapshotIndex = mock(SnapshotIndex.class);
  CheckpointV2 checkpoint = createCheckpointV2(BlobStoreStateBackendFactory.class.getName(),
      ImmutableMap.of(storeName, storeSnapshotIndexBlobId, otherStoreName, otherStoreSnapshotIndexBlobId));

  BlobStoreUtil mockBlobStoreUtil = mock(BlobStoreUtil.class);
  when(mockBlobStoreUtil.getSnapshotIndex(eq(storeSnapshotIndexBlobId), any(Metadata.class)))
      .thenReturn(CompletableFuture.completedFuture(mockStoreSnapshotIndex));
  when(mockBlobStoreUtil.getSnapshotIndex(eq(otherStoreSnapshotIndexBlobId), any(Metadata.class)))
      .thenReturn(CompletableFuture.completedFuture(mockOtherStoreSnapshotIndex));
  when(mockBlobStoreUtil.getStoreSnapshotIndexes(anyString(), anyString(), anyString(), any(Checkpoint.class),
      anySetOf(String.class))).thenCallRealMethod();

  Map<String, Pair<String, SnapshotIndex>> snapshotIndexes =
      mockBlobStoreUtil.getStoreSnapshotIndexes("testJobName", "testJobId", "taskName", checkpoint, storesToBackupOrRestore);

  assertEquals(storeSnapshotIndexBlobId, snapshotIndexes.get(storeName).getKey());
  assertEquals(mockStoreSnapshotIndex, snapshotIndexes.get(storeName).getValue());
  assertEquals(otherStoreSnapshotIndexBlobId, snapshotIndexes.get(otherStoreName).getKey());
  assertEquals(mockOtherStoreSnapshotIndex, snapshotIndexes.get(otherStoreName).getValue());
  verify(mockBlobStoreUtil, times(2)).getSnapshotIndex(anyString(), any(Metadata.class));
}