Search in sources :

Example 6 with CheckpointManager

use of org.apache.samza.checkpoint.CheckpointManager in project samza by apache.

the class ContainerStorageManager method restoreStores.

/**
 * Restores all non-side-input stores for the tasks of this container, then builds the final
 * per-task store map.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Read the last checkpoint for each active task (other tasks get a {@code null} checkpoint) and
 *       create that task's restore managers, keyed by state backend factory name.</li>
 *   <li>Initialize every restore manager with its task's checkpoint.</li>
 *   <li>Start each distinct store consumer.</li>
 *   <li>Kick off {@code restore()} on every restore manager; restores for all tasks are submitted before
 *       any of them is awaited.</li>
 *   <li>Wait for every restore to finish, then stop each distinct store consumer.</li>
 *   <li>Create the task stores and merge in the in-memory and side-input stores.</li>
 * </ol>
 *
 * @throws InterruptedException if the calling thread is interrupted while waiting for a restore to
 *         finish; the restore executor is shut down before the exception is rethrown
 * @throws SamzaException if any restore fails for a reason other than interruption
 */
private void restoreStores() throws InterruptedException {
    LOG.info("Store Restore started");
    Set<TaskName> activeTasks = getTasks(containerModel, TaskMode.Active).keySet();
    // TODO HIGH dchen verify davinci lifecycle
    // Find all non-side input stores
    Set<String> nonSideInputStoreNames = storageEngineFactories.keySet().stream().filter(storeName -> !sideInputStoreNames.contains(storeName)).collect(Collectors.toSet());
    // Obtain the checkpoints for each task
    Map<TaskName, Map<String, TaskRestoreManager>> taskRestoreManagers = new HashMap<>();
    Map<TaskName, Checkpoint> taskCheckpoints = new HashMap<>();
    containerModel.getTasks().forEach((taskName, taskModel) -> {
        Checkpoint taskCheckpoint = null;
        if (checkpointManager != null && activeTasks.contains(taskName)) {
            // only pass in checkpoints for active tasks
            taskCheckpoint = checkpointManager.readLastCheckpoint(taskName);
            LOG.info("Obtained checkpoint: {} for state restore for taskName: {}", taskCheckpoint, taskName);
        }
        // The checkpoint is recorded even when it is null so that init() below receives an explicit null
        // for tasks without a checkpoint.
        taskCheckpoints.put(taskName, taskCheckpoint);
        Map<String, Set<String>> backendFactoryStoreNames = getBackendFactoryStoreNames(taskCheckpoint, nonSideInputStoreNames, new StorageConfig(config));
        Map<String, TaskRestoreManager> taskStoreRestoreManagers = createTaskRestoreManagers(restoreStateBackendFactories, backendFactoryStoreNames, clock, samzaContainerMetrics, taskName, taskModel);
        taskRestoreManagers.put(taskName, taskStoreRestoreManagers);
    });
    // Initialize each TaskRestoreManager with the checkpoint of the task it belongs to
    taskRestoreManagers.forEach((taskName, restoreManagers) -> restoreManagers.forEach((factoryName, taskRestoreManager) -> taskRestoreManager.init(taskCheckpoints.get(taskName))));
    // Start each store consumer once.
    // Note: These consumers are per system and only changelog system store consumers will be started.
    // Some TaskRestoreManagers may not require the consumer to be started, but due to the agnostic nature of
    // ContainerStorageManager we always start the changelog consumer here in case it is required
    this.storeConsumers.values().stream().distinct().forEach(SystemConsumer::start);
    List<Future<Void>> taskRestoreFutures = new ArrayList<>();
    // Submit restore callable for each taskInstance
    taskRestoreManagers.forEach((taskInstance, restoreManagersMap) -> {
        // Submit for each restore factory
        restoreManagersMap.forEach((factoryName, taskRestoreManager) -> {
            long startTime = System.currentTimeMillis();
            String taskName = taskInstance.getTaskName();
            LOG.info("Starting restore for state for task: {}", taskName);
            CompletableFuture<Void> restoreFuture = taskRestoreManager.restore().handle((res, ex) -> {
                // Close the restore manager once its restore has finished, whether it succeeded or failed.
                // NOTE(review): the original comment here was cut off; its surviving text reads "on stop, so
                // paralleling stop() also parallelizes their compaction (a time-intensive operation)" and
                // presumably refers to stores that are compacted when stopped -- confirm against upstream.
                try {
                    taskRestoreManager.close();
                } catch (Exception e) {
                    LOG.error("Error closing restore manager for task: {} after {} restore", taskName, ex != null ? "unsuccessful" : "successful", e);
                    // ignore exception from close. container may still be able to continue processing/backups
                    // if restore manager close fails.
                }
                long timeToRestore = System.currentTimeMillis() - startTime;
                if (samzaContainerMetrics != null) {
                    // The gauge is looked up by task, so when a task has several restore managers each
                    // completion writes to the same gauge.
                    Gauge taskGauge = samzaContainerMetrics.taskStoreRestorationMetrics().getOrDefault(taskInstance, null);
                    if (taskGauge != null) {
                        taskGauge.set(timeToRestore);
                    }
                }
                if (ex != null) {
                    // log and rethrow exception to communicate restore failure
                    String msg = String.format("Error restoring state for task: %s", taskName);
                    LOG.error(msg, ex);
                    // wrap in unchecked exception to throw from lambda
                    throw new SamzaException(msg, ex);
                } else {
                    return null;
                }
            });
            taskRestoreFutures.add(restoreFuture);
        });
    });
    // Wait for every restore future to complete; an interrupt is rethrown as-is and any other failure is
    // rethrown as a SamzaException
    for (Future<Void> future : taskRestoreFutures) {
        try {
            future.get();
        } catch (InterruptedException e) {
            LOG.warn("Received an interrupt during store restoration. Interrupting the restore executor to exit " + "prematurely without restoring full state.");
            restoreExecutor.shutdownNow();
            throw e;
        } catch (Exception e) {
            LOG.error("Exception when restoring state.", e);
            throw new SamzaException("Exception when restoring state.", e);
        }
    }
    // Stop each store consumer once
    // NOTE(review): this line is skipped when the loop above throws, so the consumers started earlier are
    // not stopped by this method on a failed or interrupted restore.
    this.storeConsumers.values().stream().distinct().forEach(SystemConsumer::stop);
    // Now create persistent non side input stores in read-write mode, leave non-persistent stores as-is
    this.taskStores = createTaskStores(nonSideInputStoreNames, this.containerModel, jobContext, containerContext, storageEngineFactories, serdes, taskInstanceMetrics, taskInstanceCollectors);
    // Add in memory stores
    this.inMemoryStores.forEach((taskName, stores) -> {
        if (!this.taskStores.containsKey(taskName)) {
            taskStores.put(taskName, new HashMap<>());
        }
        taskStores.get(taskName).putAll(stores);
    });
    // Add side input stores
    this.sideInputStores.forEach((taskName, stores) -> {
        if (!this.taskStores.containsKey(taskName)) {
            taskStores.put(taskName, new HashMap<>());
        }
        taskStores.get(taskName).putAll(stores);
    });
    LOG.info("Store Restore complete");
}
Also used : StreamMetadataCache(org.apache.samza.system.StreamMetadataCache) SerdeUtils(org.apache.samza.table.utils.SerdeUtils) LoggerFactory(org.slf4j.LoggerFactory) TaskModel(org.apache.samza.job.model.TaskModel) Future(java.util.concurrent.Future) SystemConsumer(org.apache.samza.system.SystemConsumer) SamzaContainerMetrics(org.apache.samza.container.SamzaContainerMetrics) Map(java.util.Map) TaskInstanceCollector(org.apache.samza.task.TaskInstanceCollector) RoundRobinChooserFactory(org.apache.samza.system.chooser.RoundRobinChooserFactory) Path(java.nio.file.Path) StorageConfig(org.apache.samza.config.StorageConfig) RunLoopTask(org.apache.samza.container.RunLoopTask) TaskName(org.apache.samza.container.TaskName) IncomingMessageEnvelope(org.apache.samza.system.IncomingMessageEnvelope) Collection(java.util.Collection) Set(java.util.Set) DefaultChooser(org.apache.samza.system.chooser.DefaultChooser) Checkpoint(org.apache.samza.checkpoint.Checkpoint) MetricsRegistry(org.apache.samza.metrics.MetricsRegistry) Collectors(java.util.stream.Collectors) Executors(java.util.concurrent.Executors) CountDownLatch(java.util.concurrent.CountDownLatch) List(java.util.List) Optional(java.util.Optional) Config(org.apache.samza.config.Config) MetricsRegistryMap(org.apache.samza.metrics.MetricsRegistryMap) SystemAdmins(org.apache.samza.system.SystemAdmins) ScalaJavaUtil(org.apache.samza.util.ScalaJavaUtil) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) MessageChooser(org.apache.samza.system.chooser.MessageChooser) CheckpointV2(org.apache.samza.checkpoint.CheckpointV2) JobConfig(org.apache.samza.config.JobConfig) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) Serde(org.apache.samza.serializers.Serde) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) Function(java.util.function.Function) SystemStreamMetadata(org.apache.samza.system.SystemStreamMetadata) ArrayList(java.util.ArrayList) 
HashSet(java.util.HashSet) Gauge(org.apache.samza.metrics.Gauge) ImmutableList(com.google.common.collect.ImmutableList) SerdeManager(org.apache.samza.serializers.SerdeManager) MessageCollector(org.apache.samza.task.MessageCollector) CheckpointManager(org.apache.samza.checkpoint.CheckpointManager) SystemStream(org.apache.samza.system.SystemStream) RunLoop(org.apache.samza.container.RunLoop) SystemConsumersMetrics(org.apache.samza.system.SystemConsumersMetrics) ExecutorService(java.util.concurrent.ExecutorService) MapUtils(org.apache.commons.collections4.MapUtils) JavaConversions(scala.collection.JavaConversions) TaskInstanceMetrics(org.apache.samza.container.TaskInstanceMetrics) Logger(org.slf4j.Logger) TaskConfig(org.apache.samza.config.TaskConfig) JobContext(org.apache.samza.context.JobContext) ContainerContext(org.apache.samza.context.ContainerContext) SystemFactory(org.apache.samza.system.SystemFactory) Clock(org.apache.samza.util.Clock) SystemConsumers(org.apache.samza.system.SystemConsumers) File(java.io.File) SamzaException(org.apache.samza.SamzaException) TimeUnit(java.util.concurrent.TimeUnit) TaskMode(org.apache.samza.job.model.TaskMode) Entry(org.apache.samza.storage.kv.Entry) ReflectionUtil(org.apache.samza.util.ReflectionUtil) ContainerModel(org.apache.samza.job.model.ContainerModel) VisibleForTesting(com.google.common.annotations.VisibleForTesting) KeyValueStore(org.apache.samza.storage.kv.KeyValueStore) Collections(java.util.Collections) SystemConsumer(org.apache.samza.system.SystemConsumer) Set(java.util.Set) HashSet(java.util.HashSet) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SamzaException(org.apache.samza.SamzaException) Gauge(org.apache.samza.metrics.Gauge) StorageConfig(org.apache.samza.config.StorageConfig) SamzaException(org.apache.samza.SamzaException) Checkpoint(org.apache.samza.checkpoint.Checkpoint) TaskName(org.apache.samza.container.TaskName) Future(java.util.concurrent.Future) 
CompletableFuture(java.util.concurrent.CompletableFuture) Map(java.util.Map) MetricsRegistryMap(org.apache.samza.metrics.MetricsRegistryMap) HashMap(java.util.HashMap)

Example 7 with CheckpointManager

use of org.apache.samza.checkpoint.CheckpointManager in project samza by apache.

the class TestTaskStorageCommitManager method testRemoveOldCheckpointsWhenBaseDirContainsRegularFiles.

/**
 * Runs {@code cleanUp} against a durable store directory whose single entry resolves to a task store
 * dir that returns {@code null} from its filtered {@code listFiles} call, and checks that the cleanup
 * future still completes and that the expected directory lookups were made.
 */
@Test
public void testRemoveOldCheckpointsWhenBaseDirContainsRegularFiles() {
    // ---- arrange: fixed values ----
    final TaskName task = new TaskName("task1");
    final String regularFileName = "notDirectory";

    // ---- arrange: mocked collaborators ----
    final File baseDir = mock(File.class);
    final File regularFile = mock(File.class);
    final StorageManagerUtil util = mock(StorageManagerUtil.class);
    final ContainerStorageManager storageManager = mock(ContainerStorageManager.class);
    final CheckpointManager checkpoints = mock(CheckpointManager.class);
    final TaskInstanceMetrics taskMetrics = mock(TaskInstanceMetrics.class);
    final Timer timer = mock(Timer.class);
    final TaskBackupManager firstBackup = mock(TaskBackupManager.class);
    final TaskBackupManager secondBackup = mock(TaskBackupManager.class);
    final Map<String, TaskBackupManager> managersByFactory =
        ImmutableMap.of("factory1", firstBackup, "factory2", secondBackup);

    // Stubs that are in place before the commit manager is constructed.
    when(taskMetrics.storeCheckpointNs()).thenReturn(timer);
    when(storageManager.getAllStores(task)).thenReturn(Collections.emptyMap());

    final TaskStorageCommitManager commitManager = new TaskStorageCommitManager(
        task,
        managersByFactory,
        storageManager,
        Collections.emptyMap(),
        new Partition(1),
        checkpoints,
        new MapConfig(),
        ForkJoinPool.commonPool(),
        util,
        baseDir,
        taskMetrics);

    // The base dir lists exactly one entry, and the util maps that entry's name back to the same mock.
    when(baseDir.listFiles()).thenReturn(new File[] { regularFile });
    when(regularFile.getName()).thenReturn(regularFileName);
    when(util.getTaskStoreDir(eq(baseDir), eq(regularFileName), eq(task), eq(TaskMode.Active)))
        .thenReturn(regularFile);
    // listFiles yields null when invoked on something that is not a directory
    when(regularFile.listFiles(any(FileFilter.class))).thenReturn(null);

    // ---- act ----
    commitManager.cleanUp(CheckpointId.create(), new HashMap<>()).join();

    // ---- assert ----
    verify(util).getTaskStoreDir(eq(baseDir), eq(regularFileName), eq(task), eq(TaskMode.Active));
    verify(regularFile).listFiles(any(FileFilter.class));
    verify(baseDir).listFiles();
}
Also used : SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) Partition(org.apache.samza.Partition) HashMap(java.util.HashMap) CheckpointManager(org.apache.samza.checkpoint.CheckpointManager) TaskInstanceMetrics(org.apache.samza.container.TaskInstanceMetrics) Timer(org.apache.samza.metrics.Timer) TaskName(org.apache.samza.container.TaskName) MapConfig(org.apache.samza.config.MapConfig) FileFilter(java.io.FileFilter) File(java.io.File) Test(org.junit.Test)

Example 8 with CheckpointManager

use of org.apache.samza.checkpoint.CheckpointManager in project samza by apache.

the class TestTaskStorageCommitManager method testSnapshotAndCommitAllFactories.

/**
 * Checks that the commit manager initializes every backup manager with the last checkpoint, and that
 * {@code snapshot} followed by {@code upload} is forwarded to each backup manager with that manager's
 * own markers.
 */
@Test
public void testSnapshotAndCommitAllFactories() {
    // ---- arrange: fixed values ----
    final TaskName task = new TaskName("task1");
    final Map<String, String> firstMarkers =
        ImmutableMap.of("store1", "system;stream;1", "store2", "system;stream;2");
    final Map<String, String> secondMarkers =
        ImmutableMap.of("store1", "blobId1", "store2", "blobId2");

    // ---- arrange: mocked collaborators ----
    final ContainerStorageManager storageManager = mock(ContainerStorageManager.class);
    final CheckpointManager checkpoints = mock(CheckpointManager.class);
    final Checkpoint lastCheckpoint = mock(Checkpoint.class);
    final TaskBackupManager firstBackup = mock(TaskBackupManager.class);
    final TaskBackupManager secondBackup = mock(TaskBackupManager.class);
    final TaskInstanceMetrics taskMetrics = mock(TaskInstanceMetrics.class);
    final Timer timer = mock(Timer.class);
    when(taskMetrics.storeCheckpointNs()).thenReturn(timer);

    final Map<String, TaskBackupManager> managersByFactory =
        ImmutableMap.of("factory1", firstBackup, "factory2", secondBackup);
    final TaskStorageCommitManager commitManager = new TaskStorageCommitManager(
        task,
        managersByFactory,
        storageManager,
        Collections.emptyMap(),
        new Partition(1),
        checkpoints,
        new MapConfig(),
        ForkJoinPool.commonPool(),
        new StorageManagerUtil(),
        null,
        taskMetrics);

    // ---- init: both backup managers must be handed the last checkpoint ----
    when(checkpoints.readLastCheckpoint(task)).thenReturn(lastCheckpoint);
    commitManager.init();
    verify(firstBackup).init(eq(lastCheckpoint));
    verify(secondBackup).init(eq(lastCheckpoint));

    // ---- snapshot + upload ----
    final CheckpointId checkpointId = CheckpointId.create();
    when(storageManager.getAllStores(task)).thenReturn(Collections.emptyMap());
    when(firstBackup.snapshot(checkpointId)).thenReturn(firstMarkers);
    when(secondBackup.snapshot(checkpointId)).thenReturn(secondMarkers);
    when(firstBackup.upload(checkpointId, firstMarkers))
        .thenReturn(CompletableFuture.completedFuture(firstMarkers));
    when(secondBackup.upload(checkpointId, secondMarkers))
        .thenReturn(CompletableFuture.completedFuture(secondMarkers));

    final Map<String, Map<String, String>> markersByFactory = commitManager.snapshot(checkpointId);
    commitManager.upload(checkpointId, markersByFactory);

    // Each backup manager took part in the snapshot ...
    verify(firstBackup).snapshot(checkpointId);
    verify(secondBackup).snapshot(checkpointId);
    // ... and in the upload, receiving its own markers
    verify(firstBackup).upload(checkpointId, firstMarkers);
    verify(secondBackup).upload(checkpointId, secondMarkers);
    // The checkpoint timer was updated once
    verify(timer).update(anyLong());
}
Also used : SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) Partition(org.apache.samza.Partition) CheckpointManager(org.apache.samza.checkpoint.CheckpointManager) TaskInstanceMetrics(org.apache.samza.container.TaskInstanceMetrics) Checkpoint(org.apache.samza.checkpoint.Checkpoint) Timer(org.apache.samza.metrics.Timer) TaskName(org.apache.samza.container.TaskName) CheckpointId(org.apache.samza.checkpoint.CheckpointId) MapConfig(org.apache.samza.config.MapConfig) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Test(org.junit.Test)

Example 9 with CheckpointManager

use of org.apache.samza.checkpoint.CheckpointManager in project samza by apache.

the class TestTaskStorageCommitManager method testCleanupAllBackupManagers.

/**
 * Checks that {@code cleanUp} forwards the new checkpoint id to every registered backup manager
 * together with that manager's own checkpoint markers.
 */
@Test
public void testCleanupAllBackupManagers() {
    CheckpointManager checkpointManager = mock(CheckpointManager.class);
    TaskBackupManager taskBackupManager1 = mock(TaskBackupManager.class);
    TaskBackupManager taskBackupManager2 = mock(TaskBackupManager.class);
    ContainerStorageManager containerStorageManager = mock(ContainerStorageManager.class);
    Checkpoint checkpoint = mock(Checkpoint.class);
    // The durable store dir is stubbed as empty, so there are no directory entries to go through.
    File durableStoreDir = mock(File.class);
    when(durableStoreDir.listFiles()).thenReturn(new File[0]);
    TaskInstanceMetrics metrics = mock(TaskInstanceMetrics.class);
    Timer checkpointTimer = mock(Timer.class);
    when(metrics.storeCheckpointNs()).thenReturn(checkpointTimer);
    TaskName taskName = new TaskName("task1");
    Map<String, TaskBackupManager> backupManagers = ImmutableMap.of("factory1", taskBackupManager1, "factory2", taskBackupManager2);
    TaskStorageCommitManager cm = new TaskStorageCommitManager(taskName, backupManagers, containerStorageManager, Collections.emptyMap(), new Partition(1), checkpointManager, new MapConfig(), ForkJoinPool.commonPool(), new StorageManagerUtil(), durableStoreDir, metrics);
    when(checkpointManager.readLastCheckpoint(taskName)).thenReturn(checkpoint);
    when(containerStorageManager.getAllStores(taskName)).thenReturn(Collections.emptyMap());
    // Each backup manager reports an already-completed cleanup. These were previously stubbed twice
    // with identical answers; one stubbing per manager is enough.
    when(taskBackupManager1.cleanUp(any(), any())).thenReturn(CompletableFuture.completedFuture(null));
    when(taskBackupManager2.cleanUp(any(), any())).thenReturn(CompletableFuture.completedFuture(null));
    Map<String, String> factory1Checkpoints = ImmutableMap.of("store1", "system;stream;1", "store2", "system;stream;2");
    Map<String, String> factory2Checkpoints = ImmutableMap.of("store1", "blobId1", "store2", "blobId2");
    Map<String, Map<String, String>> factoryCheckpointsMap = ImmutableMap.of("factory1", factory1Checkpoints, "factory2", factory2Checkpoints);
    CheckpointId newCheckpointId = CheckpointId.create();
    // join() blocks until the cleanup has finished and surfaces any failure as a test failure.
    cm.cleanUp(newCheckpointId, factoryCheckpointsMap).join();
    verify(taskBackupManager1).cleanUp(newCheckpointId, factory1Checkpoints);
    verify(taskBackupManager2).cleanUp(newCheckpointId, factory2Checkpoints);
}
Also used : SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) Partition(org.apache.samza.Partition) CheckpointManager(org.apache.samza.checkpoint.CheckpointManager) TaskInstanceMetrics(org.apache.samza.container.TaskInstanceMetrics) Checkpoint(org.apache.samza.checkpoint.Checkpoint) Timer(org.apache.samza.metrics.Timer) TaskName(org.apache.samza.container.TaskName) CheckpointId(org.apache.samza.checkpoint.CheckpointId) MapConfig(org.apache.samza.config.MapConfig) File(java.io.File) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Test(org.junit.Test)

Example 10 with CheckpointManager

use of org.apache.samza.checkpoint.CheckpointManager in project samza by apache.

the class TestTaskStorageCommitManager method testCleanupFailsIfBackupManagerNotInitiated.

/**
 * Checks that {@code cleanUp} completes without error when the checkpoint markers mention a state
 * backend factory that has no backup manager registered with the commit manager.
 *
 * <p>Despite the "Fails" in the method name, the expected outcome is that cleanup does NOT fail: the
 * unknown factory's markers are ignored, which can happen while a job migrates from one state backend
 * to another.
 */
@Test
public void testCleanupFailsIfBackupManagerNotInitiated() {
    CheckpointManager checkpointManager = mock(CheckpointManager.class);
    ContainerStorageManager containerStorageManager = mock(ContainerStorageManager.class);
    Checkpoint checkpoint = mock(Checkpoint.class);
    // The durable store dir is stubbed as empty, so there are no directory entries to go through.
    File durableStoreDir = mock(File.class);
    when(durableStoreDir.listFiles()).thenReturn(new File[0]);
    TaskInstanceMetrics metrics = mock(TaskInstanceMetrics.class);
    Timer checkpointTimer = mock(Timer.class);
    when(metrics.storeCheckpointNs()).thenReturn(checkpointTimer);
    TaskName taskName = new TaskName("task1");
    when(containerStorageManager.getAllStores(taskName)).thenReturn(Collections.emptyMap());
    // No backup managers are registered (empty map as the second constructor argument).
    TaskStorageCommitManager cm = new TaskStorageCommitManager(taskName, Collections.emptyMap(), containerStorageManager, Collections.emptyMap(), new Partition(1), checkpointManager, new MapConfig(), ForkJoinPool.commonPool(), new StorageManagerUtil(), durableStoreDir, metrics);
    when(checkpointManager.readLastCheckpoint(taskName)).thenReturn(checkpoint);
    // "factory3" has no registered backup manager, so its markers should be ignored
    Map<String, Map<String, String>> factoryCheckpointsMap = ImmutableMap.of("factory3", Collections.emptyMap());
    CheckpointId newCheckpointId = CheckpointId.create();
    // Wait for the cleanup to finish: without join() a failure inside the returned future would go
    // unnoticed and the test could never fail. join() rethrows any such failure.
    cm.cleanUp(newCheckpointId, factoryCheckpointsMap).join();
    // should not fail the commit because the job should ignore any factories checkpoints not initialized
    // in case the user is in a migration phase from one state backend to another
}
Also used : SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) Partition(org.apache.samza.Partition) CheckpointManager(org.apache.samza.checkpoint.CheckpointManager) TaskInstanceMetrics(org.apache.samza.container.TaskInstanceMetrics) Checkpoint(org.apache.samza.checkpoint.Checkpoint) Timer(org.apache.samza.metrics.Timer) TaskName(org.apache.samza.container.TaskName) CheckpointId(org.apache.samza.checkpoint.CheckpointId) MapConfig(org.apache.samza.config.MapConfig) File(java.io.File) HashMap(java.util.HashMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Test(org.junit.Test)

Aggregations

CheckpointManager (org.apache.samza.checkpoint.CheckpointManager)12 TaskName (org.apache.samza.container.TaskName)10 HashMap (java.util.HashMap)9 MapConfig (org.apache.samza.config.MapConfig)9 SystemStreamPartition (org.apache.samza.system.SystemStreamPartition)9 Map (java.util.Map)8 Partition (org.apache.samza.Partition)8 Test (org.junit.Test)8 TaskInstanceMetrics (org.apache.samza.container.TaskInstanceMetrics)7 Checkpoint (org.apache.samza.checkpoint.Checkpoint)6 Timer (org.apache.samza.metrics.Timer)6 ImmutableMap (com.google.common.collect.ImmutableMap)5 File (java.io.File)5 CheckpointId (org.apache.samza.checkpoint.CheckpointId)5 SystemAdmins (org.apache.samza.system.SystemAdmins)5 Config (org.apache.samza.config.Config)4 StorageConfig (org.apache.samza.config.StorageConfig)4 TaskConfig (org.apache.samza.config.TaskConfig)4 SamzaContainerMetrics (org.apache.samza.container.SamzaContainerMetrics)4 ContainerContext (org.apache.samza.context.ContainerContext)4