use of org.apache.samza.storage.blobstore.diff.DirDiff in project samza by apache.
the class DirDiffUtil method getDirDiff.
private static DirDiff getDirDiff(File localSnapshotDir, DirIndex remoteSnapshotDir, BiPredicate<File, FileIndex> areSameFile, boolean isRootDir) {
  Preconditions.checkState(localSnapshotDir != null && localSnapshotDir.isDirectory());
  Preconditions.checkNotNull(remoteSnapshotDir);
  LOG.debug("Creating DirDiff between local dir: {} and remote dir: {}", localSnapshotDir.getPath(), remoteSnapshotDir.getDirName());
  List<DirDiff> subDirsAdded = new ArrayList<>();
  List<DirDiff> subDirsRetained = new ArrayList<>();
  List<DirIndex> subDirsRemoved = new ArrayList<>();
  // listFiles returns an empty array if the local snapshot directory contains no regular files
  List<File> localSnapshotFiles = Arrays.asList(Objects.requireNonNull(localSnapshotDir.listFiles(File::isFile)));
  List<FileIndex> remoteSnapshotFiles = remoteSnapshotDir.getFilesPresent();
  // listFiles returns an empty array if the local snapshot directory contains no subdirectories
  List<File> localSnapshotSubDirs = Arrays.asList(Objects.requireNonNull(localSnapshotDir.listFiles(File::isDirectory)));
  Set<String> localSnapshotSubDirNames = localSnapshotSubDirs.stream().map(File::getName).collect(Collectors.toCollection(HashSet::new));
  List<DirIndex> remoteSnapshotSubDirs = remoteSnapshotDir.getSubDirsPresent();
  Set<String> remoteSnapshotSubDirNames = remoteSnapshotSubDirs.stream().map(DirIndex::getDirName).collect(Collectors.toCollection(HashSet::new));
  // TODO MED shesharm: this compares each file in directory 3 times. Categorize files in one traversal instead.
  // 1. Categorize files as new/changed (upload), unchanged (retain) or no longer present locally (remove).
  List<File> filesToUpload = getNewFilesToUpload(remoteSnapshotFiles, localSnapshotFiles, areSameFile);
  List<FileIndex> filesToRetain = getFilesToRetain(remoteSnapshotFiles, localSnapshotFiles, areSameFile);
  List<FileIndex> filesToRemove = getFilesToRemove(remoteSnapshotFiles, localSnapshotFiles, areSameFile);
  // 2. Subdirs in local snapshot: new subdirs are added in full, common subdirs are diffed recursively.
  for (File localSnapshotSubDir : localSnapshotSubDirs) {
    if (!remoteSnapshotSubDirNames.contains(localSnapshotSubDir.getName())) {
      LOG.debug("Subdir {} present in local snapshot but not in remote snapshot. Recursively adding subdir contents.", localSnapshotSubDir.getPath());
      subDirsAdded.add(getDiffForNewDir(localSnapshotSubDir));
    } else {
      LOG.debug("Subdir {} present in local snapshot and in remote snapshot. Recursively comparing local and remote subdirs.", localSnapshotSubDir.getPath());
      DirIndex remoteSubDirIndex = remoteSnapshotSubDirs.stream().filter(indexBlob -> indexBlob.getDirName().equals(localSnapshotSubDir.getName())).findFirst().get();
      subDirsRetained.add(getDirDiff(localSnapshotSubDir, remoteSubDirIndex, areSameFile, false));
    }
  }
  // 3. Subdirs in remote snapshot but not in local snapshot: mark for removal from the remote snapshot.
  for (DirIndex remoteSnapshotSubDir : remoteSnapshotSubDirs) {
    if (!localSnapshotSubDirNames.contains(remoteSnapshotSubDir.getDirName())) {
      LOG.debug("Subdir {} present in remote snapshot but not in local snapshot. Marking for removal from remote snapshot.", remoteSnapshotSubDir.getDirName());
      subDirsRemoved.add(remoteSnapshotSubDir);
    }
  }
  String dirName = isRootDir ? DirIndex.ROOT_DIR_NAME : localSnapshotDir.getName();
  return new DirDiff(dirName, filesToUpload, filesToRetain, filesToRemove, subDirsAdded, subDirsRetained, subDirsRemoved);
}
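The TODO in getDirDiff notes that getNewFilesToUpload, getFilesToRetain and getFilesToRemove each scan the file lists separately, so every file is compared up to three times. Below is a minimal sketch of the single-traversal categorization the TODO suggests, assuming (as the tests later in this page do) that a local file can only match the remote file with the same name. The FileCategories holder, the class name, and the import path for FileIndex are illustrative assumptions, not Samza code, and the real helpers may apply additional checks.

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.BiPredicate;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.apache.samza.storage.blobstore.index.FileIndex; // package path assumed

// Hypothetical holder for the three file categories produced in a single pass.
class FileCategories {
  final List<File> toUpload = new ArrayList<>();      // present locally, new or changed
  final List<FileIndex> toRetain = new ArrayList<>(); // unchanged, keep the remote blob
  final List<FileIndex> toRemove = new ArrayList<>(); // no longer valid remotely
}

class SingleTraversalDiff {
  static FileCategories categorize(List<File> localFiles, List<FileIndex> remoteFiles,
      BiPredicate<File, FileIndex> areSameFile) {
    FileCategories categories = new FileCategories();
    // Index remote files by name; assumes a local file can only match the same-named remote file.
    Map<String, FileIndex> remoteByName = remoteFiles.stream()
        .collect(Collectors.toMap(FileIndex::getFileName, Function.identity()));
    for (File localFile : localFiles) {
      FileIndex remoteFile = remoteByName.remove(localFile.getName());
      if (remoteFile != null && areSameFile.test(localFile, remoteFile)) {
        categories.toRetain.add(remoteFile);   // unchanged: keep the remote blob
      } else {
        categories.toUpload.add(localFile);    // new or changed: upload
        if (remoteFile != null) {
          categories.toRemove.add(remoteFile); // changed: remove the stale remote blob
        }
      }
    }
    // any remote file left in the map has no local counterpart
    categories.toRemove.addAll(remoteByName.values());
    return categories;
  }
}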
use of org.apache.samza.storage.blobstore.diff.DirDiff in project samza by apache.
the class BlobStoreBackupManager method upload.
@Override
public CompletableFuture<Map<String, String>> upload(CheckpointId checkpointId, Map<String, String> storeSCMs) {
  long uploadStartTime = System.nanoTime();
  // reset gauges for each upload
  metrics.filesToUpload.getValue().set(0L);
  metrics.bytesToUpload.getValue().set(0L);
  metrics.filesUploaded.getValue().set(0L);
  metrics.bytesUploaded.getValue().set(0L);
  metrics.filesRemaining.getValue().set(0L);
  metrics.bytesRemaining.getValue().set(0L);
  metrics.filesToRetain.getValue().set(0L);
  metrics.bytesToRetain.getValue().set(0L);
  // This map is used to atomically replace the prevStoreSnapshotIndexesFuture map at the end of the task commit
  Map<String, CompletableFuture<Pair<String, SnapshotIndex>>> storeToSCMAndSnapshotIndexPairFutures = new HashMap<>();
  // This map is used to return the serialized State Checkpoint Markers to the caller
  Map<String, CompletableFuture<String>> storeToSerializedSCMFuture = new HashMap<>();
  storesToBackup.forEach((storeName) -> {
    long storeUploadStartTime = System.nanoTime();
    try {
      // metadata for the current store snapshot to upload
      SnapshotMetadata snapshotMetadata = new SnapshotMetadata(checkpointId, jobName, jobId, taskName, storeName);
      // get the local store dir corresponding to the current checkpointId
      File storeDir = storageManagerUtil.getTaskStoreDir(loggedStoreBaseDir, storeName, taskModel.getTaskName(), taskModel.getTaskMode());
      String checkpointDirPath = storageManagerUtil.getStoreCheckpointDir(storeDir, checkpointId);
      File checkpointDir = new File(checkpointDirPath);
      LOG.debug("Got task: {} store: {} storeDir: {} and checkpointDir: {}", taskName, storeName, storeDir, checkpointDir);
      // guaranteed to be available since a new task commit may not start until the previous one is complete
      Map<String, Pair<String, SnapshotIndex>> prevStoreSnapshotIndexes = prevStoreSnapshotIndexesFuture.get(0, TimeUnit.MILLISECONDS);
      // get the previous store directory contents
      DirIndex prevDirIndex;
      if (prevStoreSnapshotIndexes.containsKey(storeName)) {
        prevDirIndex = prevStoreSnapshotIndexes.get(storeName).getRight().getDirIndex();
      } else {
        // no previous SnapshotIndex means that this is the first commit for this store. Create an empty DirIndex.
        prevDirIndex = new DirIndex(checkpointDir.getName(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList());
      }
      long dirDiffStartTime = System.nanoTime();
      // get the diff between previous and current store directories
      DirDiff dirDiff = DirDiffUtil.getDirDiff(checkpointDir, prevDirIndex, DirDiffUtil.areSameFile(false));
      metrics.storeDirDiffNs.get(storeName).update(System.nanoTime() - dirDiffStartTime);
      DirDiff.Stats stats = DirDiff.getStats(dirDiff);
      updateStoreDiffMetrics(storeName, stats);
      metrics.filesToUpload.getValue().addAndGet(stats.filesAdded);
      metrics.bytesToUpload.getValue().addAndGet(stats.bytesAdded);
      // Note: the filesRemaining metric is set to filesAdded at the beginning of the current upload and then
      // counted down as each file upload completes.
      metrics.filesRemaining.getValue().addAndGet(stats.filesAdded);
      metrics.bytesRemaining.getValue().addAndGet(stats.bytesAdded);
      metrics.filesToRetain.getValue().addAndGet(stats.filesRetained);
      metrics.bytesToRetain.getValue().addAndGet(stats.bytesRetained);
      // upload the diff to the blob store and get the new directory index
      CompletionStage<DirIndex> dirIndexFuture = blobStoreUtil.putDir(dirDiff, snapshotMetadata);
      CompletionStage<SnapshotIndex> snapshotIndexFuture = dirIndexFuture.thenApplyAsync(dirIndex -> {
        LOG.trace("Dir upload complete. Returning new SnapshotIndex for task: {} store: {}.", taskName, storeName);
        Optional<String> prevSnapshotIndexBlobId = Optional.ofNullable(prevStoreSnapshotIndexes.get(storeName)).map(Pair::getLeft);
        return new SnapshotIndex(clock.currentTimeMillis(), snapshotMetadata, dirIndex, prevSnapshotIndexBlobId);
      }, executor);
      // upload the new snapshot index to the blob store and get its blob id
      CompletionStage<String> snapshotIndexBlobIdFuture = snapshotIndexFuture.thenComposeAsync(si -> {
        LOG.trace("Uploading Snapshot index for task: {} store: {}", taskName, storeName);
        return blobStoreUtil.putSnapshotIndex(si);
      }, executor);
      // save the store name and its (SnapshotIndex blob id, SnapshotIndex) pair. At the end of the upload, atomically
      // update the previous snapshot index map with this.
      CompletableFuture<Pair<String, SnapshotIndex>> scmAndSnapshotIndexPairFuture = FutureUtil.toFutureOfPair(Pair.of(snapshotIndexBlobIdFuture.toCompletableFuture(), snapshotIndexFuture.toCompletableFuture()));
      scmAndSnapshotIndexPairFuture.whenComplete((res, ex) -> {
        long uploadTimeNs = System.nanoTime() - storeUploadStartTime;
        metrics.storeUploadNs.get(storeName).update(uploadTimeNs);
      });
      storeToSCMAndSnapshotIndexPairFutures.put(storeName, scmAndSnapshotIndexPairFuture);
      storeToSerializedSCMFuture.put(storeName, snapshotIndexBlobIdFuture.toCompletableFuture());
    } catch (Exception e) {
      throw new SamzaException(String.format("Error uploading store snapshot to blob store for task: %s, store: %s, checkpointId: %s", taskName, storeName, checkpointId), e);
    }
  });
  // replace the previous storeName-to-SnapshotIndex mapping with the new mapping.
  this.prevStoreSnapshotIndexesFuture = FutureUtil.toFutureOfMap(storeToSCMAndSnapshotIndexPairFutures);
  return FutureUtil.toFutureOfMap(storeToSerializedSCMFuture).whenComplete((res, ex) -> metrics.uploadNs.update(System.nanoTime() - uploadStartTime));
}
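The upload method composes several per-store futures and finally flattens a map of futures into a single future of a map via FutureUtil.toFutureOfMap. Samza's actual FutureUtil implementation is not shown on this page; the following is only a minimal sketch of how such a helper can be written with CompletableFuture.allOf, to make the composition above concrete. The class and method names are illustrative.

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.CompletableFuture;

// Illustrative helper, not Samza's FutureUtil: completes when every value future completes,
// then materializes the results into a plain map. Fails if any value future fails.
final class MapFutures {
  static <K, V> CompletableFuture<Map<K, V>> toFutureOfMap(Map<K, CompletableFuture<V>> futureMap) {
    CompletableFuture<Void> allDone =
        CompletableFuture.allOf(futureMap.values().toArray(new CompletableFuture[0]));
    return allDone.thenApply(ignored -> {
      Map<K, V> result = new HashMap<>();
      // join() does not block here: allOf guarantees every value future is already complete
      futureMap.forEach((key, future) -> result.put(key, future.join()));
      return result;
    });
  }
}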
use of org.apache.samza.storage.blobstore.diff.DirDiff in project samza by apache.
the class TestBlobStoreUtil method testCleanup.
@Test
public void testCleanup() throws IOException, ExecutionException, InterruptedException {
  BlobStoreManager blobStoreManager = mock(BlobStoreManager.class);
  // File, dir and recursive dir added, retained and removed in local vs. remote.
  // Using unique file names since the test util uses only the file name (leaf node)
  // as the mock blob id, not the full file path.
  String local = "[a, c, z/1, y/2, p/m/3, q/n/4]";
  String remote = "[a, b, z/1, x/5, p/m/3, r/o/6]";
  String expectedRemoved = "[b, 5, 6]";
  // keep only the last path element (the file name).
  SortedSet<String> expectedRemovedFiles = BlobStoreTestUtil.getExpected(expectedRemoved);
  // Set up environment
  Path localSnapshotDir = BlobStoreTestUtil.createLocalDir(local);
  String basePath = localSnapshotDir.toAbsolutePath().toString();
  DirIndex remoteSnapshotDir = BlobStoreTestUtil.createDirIndex(remote);
  SnapshotMetadata snapshotMetadata = new SnapshotMetadata(checkpointId, jobName, jobId, taskName, storeName);
  DirDiff dirDiff = DirDiffUtil.getDirDiff(localSnapshotDir.toFile(), remoteSnapshotDir, (localFile, remoteFile) -> localFile.getName().equals(remoteFile.getFileName()));
  BlobStoreUtil blobStoreUtil = new BlobStoreUtil(blobStoreManager, EXECUTOR, null, null);
  when(blobStoreManager.put(any(InputStream.class), any(Metadata.class))).thenReturn(CompletableFuture.completedFuture("blobId"));
  CompletionStage<DirIndex> dirIndexFuture = blobStoreUtil.putDir(dirDiff, snapshotMetadata);
  DirIndex dirIndex = null;
  try {
    // should already be complete. If not, future composition in putDir is broken.
    dirIndex = dirIndexFuture.toCompletableFuture().get(0, TimeUnit.MILLISECONDS);
  } catch (TimeoutException e) {
    fail("Future returned from putDir should already be complete.");
  }
  // Set up mocks
  SortedSet<String> allDeleted = new TreeSet<>();
  when(blobStoreManager.delete(anyString(), any(Metadata.class))).thenAnswer((Answer<CompletableFuture<Void>>) invocation -> {
    String blobId = invocation.getArgumentAt(0, String.class);
    allDeleted.add(blobId);
    return CompletableFuture.completedFuture(null);
  });
  // Execute
  CompletionStage<Void> cleanUpFuture = blobStoreUtil.cleanUpDir(dirIndex, metadata);
  try {
    // should already be complete. If not, future composition in cleanUpDir is broken.
    cleanUpFuture.toCompletableFuture().get(0, TimeUnit.MILLISECONDS);
  } catch (TimeoutException e) {
    fail("Future returned from cleanUpDir should already be complete.");
  }
  // Assert
  assertEquals(expectedRemovedFiles, allDeleted);
}
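This test (and the two below) repeatedly uses a zero-timeout get(0, TimeUnit.MILLISECONDS) to assert that a future has already completed by the time it is inspected. A small hypothetical JUnit 4 helper capturing that idiom might look like the sketch below; it is not part of the Samza test utilities.

import static org.junit.Assert.fail;

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

final class FutureAsserts {
  // Hypothetical helper: asserts the future is already complete and returns its value.
  static <T> T getAlreadyComplete(CompletableFuture<T> future, String description)
      throws ExecutionException, InterruptedException {
    try {
      return future.get(0, TimeUnit.MILLISECONDS);
    } catch (TimeoutException e) {
      fail(description + " should already be complete");
      return null; // unreachable: fail() throws AssertionError
    }
  }
}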
use of org.apache.samza.storage.blobstore.diff.DirDiff in project samza by apache.
the class TestBlobStoreUtil method testPutDirFailsIfAnyFileUploadFails.
@Test
public void testPutDirFailsIfAnyFileUploadFails() throws IOException, TimeoutException, InterruptedException {
  BlobStoreManager blobStoreManager = mock(BlobStoreManager.class);
  // Two new files to add in local, no files in the remote snapshot.
  String local = "[a, b]";
  String remote = "[]";
  // Set up environment
  Path localSnapshotDir = BlobStoreTestUtil.createLocalDir(local);
  String basePath = localSnapshotDir.toAbsolutePath().toString();
  DirIndex remoteSnapshotDir = BlobStoreTestUtil.createDirIndex(remote);
  SnapshotMetadata snapshotMetadata = new SnapshotMetadata(checkpointId, jobName, jobId, taskName, storeName);
  DirDiff dirDiff = DirDiffUtil.getDirDiff(localSnapshotDir.toFile(), remoteSnapshotDir, (localFile, remoteFile) -> localFile.getName().equals(remoteFile.getFileName()));
  // Set up mocks
  SamzaException exception = new SamzaException("Error uploading file");
  CompletableFuture<String> failedFuture = new CompletableFuture<>();
  failedFuture.completeExceptionally(exception);
  when(blobStoreManager.put(any(InputStream.class), any(Metadata.class))).thenAnswer((Answer<CompletableFuture<String>>) invocation -> {
    Metadata metadata = invocation.getArgumentAt(1, Metadata.class);
    String path = metadata.getPayloadPath();
    if (path.endsWith("a")) {
      return CompletableFuture.completedFuture("aBlobId");
    } else {
      return failedFuture;
    }
  });
  // Execute
  BlobStoreUtil blobStoreUtil = new BlobStoreUtil(blobStoreManager, EXECUTOR, null, null);
  CompletionStage<DirIndex> dirIndexFuture = blobStoreUtil.putDir(dirDiff, snapshotMetadata);
  try {
    // should already be complete. If not, future composition in putDir is broken.
    dirIndexFuture.toCompletableFuture().get(0, TimeUnit.MILLISECONDS);
  } catch (ExecutionException e) {
    Throwable cause = e.getCause();
    // Assert that the result future fails and that the cause is propagated correctly
    assertEquals(exception, cause);
    return;
  }
  fail("DirIndex future should have been completed with an exception");
}
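As an alternative to catching ExecutionException from a zero-timeout get, a test can inspect the completion state of the future directly with plain CompletableFuture APIs. The helper below is only an illustrative sketch, not Samza test code; it assumes the failure cause is not itself a CompletionException.

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;

final class ExceptionalFutureAsserts {
  // Hypothetical helper: asserts the future already failed with the expected cause.
  static void assertFailedWith(CompletableFuture<?> future, Throwable expectedCause) {
    assertTrue("future should be completed exceptionally", future.isCompletedExceptionally());
    try {
      future.join();
      fail("join() should have thrown CompletionException");
    } catch (CompletionException e) {
      assertEquals(expectedCause, e.getCause());
    }
  }
}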
use of org.apache.samza.storage.blobstore.diff.DirDiff in project samza by apache.
the class TestBlobStoreUtil method testCleanUpFailsIfAnyFileDeleteFails.
@Test
public void testCleanUpFailsIfAnyFileDeleteFails() throws IOException, TimeoutException, InterruptedException, ExecutionException {
  BlobStoreManager blobStoreManager = mock(BlobStoreManager.class);
  // Files in local and remote with no overlap, so every remote file is marked for removal.
  // Using unique file names since the test util uses only the file name (leaf node)
  // as the mock blob id, not the full file path.
  String local = "[a, b]";
  String remote = "[c, d]";
  // Set up environment
  Path localSnapshotDir = BlobStoreTestUtil.createLocalDir(local);
  String basePath = localSnapshotDir.toAbsolutePath().toString();
  DirIndex remoteSnapshotDir = BlobStoreTestUtil.createDirIndex(remote);
  SnapshotMetadata snapshotMetadata = new SnapshotMetadata(checkpointId, jobName, jobId, taskName, storeName);
  DirDiff dirDiff = DirDiffUtil.getDirDiff(localSnapshotDir.toFile(), remoteSnapshotDir, (localFile, remoteFile) -> localFile.getName().equals(remoteFile.getFileName()));
  BlobStoreUtil blobStoreUtil = new BlobStoreUtil(blobStoreManager, EXECUTOR, null, null);
  when(blobStoreManager.put(any(InputStream.class), any(Metadata.class))).thenReturn(CompletableFuture.completedFuture("blobId"));
  CompletionStage<DirIndex> dirIndexFuture = blobStoreUtil.putDir(dirDiff, snapshotMetadata);
  DirIndex dirIndex = null;
  try {
    // should already be complete. If not, future composition in putDir is broken.
    dirIndex = dirIndexFuture.toCompletableFuture().get(0, TimeUnit.MILLISECONDS);
  } catch (TimeoutException e) {
    fail("Future returned from putDir should already be complete.");
  }
  // Set up mocks
  SamzaException exception = new SamzaException("Error deleting file");
  CompletableFuture<Void> failedFuture = new CompletableFuture<>();
  failedFuture.completeExceptionally(exception);
  when(blobStoreManager.delete(anyString(), any(Metadata.class))).thenAnswer((Answer<CompletableFuture<Void>>) invocation -> {
    String blobId = invocation.getArgumentAt(0, String.class);
    if (blobId.equals("c")) {
      return CompletableFuture.completedFuture(null);
    } else {
      return failedFuture;
    }
  });
  // Execute
  CompletionStage<Void> cleanUpFuture = blobStoreUtil.cleanUpDir(dirIndex, metadata);
  try {
    // should already be complete. If not, future composition in cleanUpDir is broken.
    cleanUpFuture.toCompletableFuture().get(0, TimeUnit.MILLISECONDS);
  } catch (ExecutionException e) {
    Throwable cause = e.getCause();
    // Assert that the result future fails and that the cause is propagated correctly
    assertEquals(exception, cause);
    return;
  }
  fail("Clean up future should have been completed with an exception");
}
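All three tests above pass the same name-based comparison lambda to DirDiffUtil.getDirDiff. If this were refactored, a shared test constant could remove the duplication; the sketch below is a hypothetical helper, and the FileIndex import path is assumed.

import java.io.File;
import java.util.function.BiPredicate;

import org.apache.samza.storage.blobstore.index.FileIndex; // package path assumed

final class TestDirDiffPredicates {
  // Hypothetical shared predicate: treat a local file and a remote FileIndex as the same
  // file when their names match (content and attributes are ignored, as in the tests above).
  static final BiPredicate<File, FileIndex> SAME_NAME =
      (localFile, remoteFile) -> localFile.getName().equals(remoteFile.getFileName());
}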