Search in sources :

Example 1 with DirIndex

use of org.apache.samza.storage.blobstore.index.DirIndex in project samza by apache.

the class BlobStoreUtil method removeTTL.

/**
 * Recursively mark all the blobs associated with the {@link DirIndex} to never expire (remove TTL).
 * <p>
 * Sub-directories listed as present are handled by recursive calls. For every file listed as present, one
 * TTL-removal request is issued per {@link FileBlob} of the file through
 * {@code FutureUtil.executeAsyncWithRetries}.
 *
 * @param dirIndex the {@link DirIndex} whose contents' TTL needs to be removed
 * @param metadata {@link Metadata} related to the request; its job name, job id, task name and store name
 *                 are copied into the per-file request metadata
 * @return A future that completes when all the blobs associated with this dirIndex are marked to
 * never expire.
 */
private CompletableFuture<Void> removeTTL(DirIndex dirIndex, Metadata metadata) {
    String dirName = dirIndex.getDirName();
    if (DirIndex.ROOT_DIR_NAME.equals(dirName)) {
        LOG.debug("Removing TTL for files and dirs present in DirIndex for root dir.");
    } else {
        LOG.debug("Removing TTL for files and dirs present in DirIndex for dir: {}", dirName);
    }
    List<CompletableFuture<Void>> updateTTLsFuture = new ArrayList<>();
    // Recurse into the sub-directories first; each recursive call contributes a single aggregate future.
    for (DirIndex subDir : dirIndex.getSubDirsPresent()) {
        updateTTLsFuture.add(removeTTL(subDir, metadata));
    }
    for (FileIndex file : dirIndex.getFilesPresent()) {
        // Per-file request metadata: the payload is identified by the file name and size, everything else
        // is inherited from the caller-supplied metadata.
        Metadata requestMetadata = new Metadata(file.getFileName(), Optional.of(file.getFileMetadata().getSize()), metadata.getJobName(), metadata.getJobId(), metadata.getTaskName(), metadata.getStoreName());
        List<FileBlob> fileBlobs = file.getBlobs();
        for (FileBlob fileBlob : fileBlobs) {
            // The operation name is a plain string, not a logging format string, so it must not
            // contain a "{}" placeholder.
            String opname = "removeTTL for fileBlob: " + file.getFileName() + " with blobId: " + fileBlob.getBlobId();
            Supplier<CompletionStage<Void>> ttlRemovalAction = () -> blobStoreManager.removeTTL(fileBlob.getBlobId(), requestMetadata).toCompletableFuture();
            CompletableFuture<Void> ttlRemovalFuture = FutureUtil.executeAsyncWithRetries(opname, ttlRemovalAction, isCauseNonRetriable(), executor);
            updateTTLsFuture.add(ttlRemovalFuture);
        }
    }
    return CompletableFuture.allOf(updateTTLsFuture.toArray(new CompletableFuture[0]));
}
Also used : FileBlob(org.apache.samza.storage.blobstore.index.FileBlob) ArrayList(java.util.ArrayList) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) SnapshotMetadata(org.apache.samza.storage.blobstore.index.SnapshotMetadata) Metadata(org.apache.samza.storage.blobstore.Metadata) CompletableFuture(java.util.concurrent.CompletableFuture) FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) DirIndex(org.apache.samza.storage.blobstore.index.DirIndex) CompletionStage(java.util.concurrent.CompletionStage)

Example 2 with DirIndex

use of org.apache.samza.storage.blobstore.index.DirIndex in project samza by apache.

the class BlobStoreUtil method restoreDir.

/**
 * Non-blocking restore of the contents described by a remote {@link DirIndex} into a local directory.
 * <p>
 * The target directory (and any missing parents) is created synchronously. After that, one asynchronous
 * restore is started per file listed as present in the index, followed by a recursive restore for each
 * sub-directory listed as present.
 *
 * @param baseDir local directory to restore into
 * @param dirIndex remote directory index describing the files and sub-dirs to restore
 * @param metadata {@link Metadata} related to the request; its job name, job id, task name and store name
 *                 are copied into the per-file request metadata
 * @return A future that completes when all the async downloads completes
 * @throws SamzaException if the target directory could not be created
 */
public CompletableFuture<Void> restoreDir(File baseDir, DirIndex dirIndex, Metadata metadata) {
    LOG.debug("Restoring contents of directory: {} from remote snapshot.", baseDir);
    try {
        // make sure the target directory (including parents) exists before anything is downloaded
        Files.createDirectories(baseDir.toPath());
    } catch (IOException exception) {
        String failedPath = baseDir.getAbsolutePath();
        LOG.error("Error creating directory: {} for restore", failedPath, exception);
        throw new SamzaException(String.format("Error creating directory: %s for restore", failedPath), exception);
    }

    final String basePath = baseDir.getAbsolutePath();
    final List<CompletableFuture<Void>> pendingRestores = new ArrayList<>();

    // files first: one retried download per file present in this directory
    dirIndex.getFilesPresent().forEach(fileIndex -> {
        File target = Paths.get(basePath, fileIndex.getFileName()).toFile();
        String targetPath = target.getAbsolutePath();
        Metadata fileRequestMetadata = new Metadata(
            targetPath,
            Optional.of(fileIndex.getFileMetadata().getSize()),
            metadata.getJobName(),
            metadata.getJobId(),
            metadata.getTaskName(),
            metadata.getStoreName());
        List<FileBlob> blobs = fileIndex.getBlobs();
        pendingRestores.add(
            FutureUtil.executeAsyncWithRetries(
                "restoreFile: " + targetPath,
                () -> getFile(blobs, target, fileRequestMetadata),
                isCauseNonRetriable(),
                executor));
    });

    // then the sub-directories, each restored recursively under the base directory
    dirIndex.getSubDirsPresent().forEach(childIndex -> {
        File childDir = Paths.get(basePath, childIndex.getDirName()).toFile();
        pendingRestores.add(restoreDir(childDir, childIndex, metadata));
    });

    return FutureUtil.allOf(pendingRestores);
}
Also used : FileBlob(org.apache.samza.storage.blobstore.index.FileBlob) ArrayList(java.util.ArrayList) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) SnapshotMetadata(org.apache.samza.storage.blobstore.index.SnapshotMetadata) Metadata(org.apache.samza.storage.blobstore.Metadata) IOException(java.io.IOException) SamzaException(org.apache.samza.SamzaException) CompletableFuture(java.util.concurrent.CompletableFuture) FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) DirIndex(org.apache.samza.storage.blobstore.index.DirIndex) File(java.io.File)

Example 3 with DirIndex

use of org.apache.samza.storage.blobstore.index.DirIndex in project samza by apache.

the class BlobStoreUtil method cleanUpDir.

/**
 * Recursively issue delete requests for files and dirs marked to be removed in a previously created remote snapshot.
 * Note: We do not immediately delete files/dirs to be removed when uploading a snapshot to the remote
 * store. We just track them for deletion during the upload, and delete them AFTER the snapshot is uploaded, and the
 * blob IDs have been persisted as part of the checkpoint. This is to prevent data loss if a failure happens
 * part way through the commit. We issue the deletes for these files/subdirs in the cleanUp() phase of the
 * commit lifecycle.
 * <p>
 * Three kinds of entries are processed: files marked as removed are deleted, sub-directories marked as removed
 * are deleted in their entirety via {@code deleteDir}, and sub-directories still present are cleaned up
 * recursively.
 *
 * @param dirIndex the dir in the remote snapshot to clean up.
 * @param metadata Metadata related to the request
 * @return a future that completes when all the files and subdirs marked for deletion are cleaned up.
 */
public CompletionStage<Void> cleanUpDir(DirIndex dirIndex, Metadata metadata) {
    String dirName = dirIndex.getDirName();
    if (DirIndex.ROOT_DIR_NAME.equals(dirName)) {
        LOG.debug("Cleaning up root dir in blob store.");
    } else {
        LOG.debug("Cleaning up dir: {} in blob store.", dirName);
    }
    // Stages are converted to CompletableFuture as they are collected: copying arbitrary CompletionStage
    // instances into a CompletableFuture[] would fail with ArrayStoreException for any stage that is not
    // itself a CompletableFuture.
    List<CompletableFuture<Void>> cleanUpFutures = new ArrayList<>();
    List<FileIndex> files = dirIndex.getFilesRemoved();
    for (FileIndex file : files) {
        Metadata requestMetadata = new Metadata(file.getFileName(), Optional.of(file.getFileMetadata().getSize()), metadata.getJobName(), metadata.getJobId(), metadata.getTaskName(), metadata.getStoreName());
        cleanUpFutures.add(deleteFile(file, requestMetadata).toCompletableFuture());
    }
    for (DirIndex subDirToDelete : dirIndex.getSubDirsRemoved()) {
        // recursively delete ALL contents of the subDirToDelete.
        cleanUpFutures.add(deleteDir(subDirToDelete, metadata).toCompletableFuture());
    }
    for (DirIndex subDirToRetain : dirIndex.getSubDirsPresent()) {
        // recursively clean up the subDir, only deleting files and subdirs marked for deletion.
        cleanUpFutures.add(cleanUpDir(subDirToRetain, metadata).toCompletableFuture());
    }
    return CompletableFuture.allOf(cleanUpFutures.toArray(new CompletableFuture[0]));
}
Also used : FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) CompletableFuture(java.util.concurrent.CompletableFuture) ArrayList(java.util.ArrayList) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) SnapshotMetadata(org.apache.samza.storage.blobstore.index.SnapshotMetadata) Metadata(org.apache.samza.storage.blobstore.Metadata) DirIndex(org.apache.samza.storage.blobstore.index.DirIndex) CompletionStage(java.util.concurrent.CompletionStage)

Example 4 with DirIndex

use of org.apache.samza.storage.blobstore.index.DirIndex in project samza by apache.

the class BlobStoreUtil method deleteDir.

/**
 * WARNING: Recursively delete **ALL** the associated files and subdirs within the provided {@link DirIndex}.
 * <p>
 * Only the files and sub-directories listed as present in the index are visited; each file is deleted via
 * {@code deleteFile} and each sub-directory via a recursive call.
 *
 * @param dirIndex {@link DirIndex} whose entire contents are to be deleted.
 * @param metadata {@link Metadata} related to the request
 * @return a future that completes when ALL the files and subdirs associated with the dirIndex have been
 * marked for deletion in the remote blob store.
 */
public CompletionStage<Void> deleteDir(DirIndex dirIndex, Metadata metadata) {
    LOG.debug("Completely deleting dir: {} in blob store", dirIndex.getDirName());
    // Stages are converted to CompletableFuture as they are collected: copying arbitrary CompletionStage
    // instances into a CompletableFuture[] would fail with ArrayStoreException for any stage that is not
    // itself a CompletableFuture.
    List<CompletableFuture<Void>> deleteFutures = new ArrayList<>();
    // Delete all files present in subDir
    for (FileIndex file : dirIndex.getFilesPresent()) {
        Metadata requestMetadata = new Metadata(file.getFileName(), Optional.of(file.getFileMetadata().getSize()), metadata.getJobName(), metadata.getJobId(), metadata.getTaskName(), metadata.getStoreName());
        deleteFutures.add(deleteFile(file, requestMetadata).toCompletableFuture());
    }
    // Delete all subDirs present recursively
    for (DirIndex subDir : dirIndex.getSubDirsPresent()) {
        deleteFutures.add(deleteDir(subDir, metadata).toCompletableFuture());
    }
    return CompletableFuture.allOf(deleteFutures.toArray(new CompletableFuture[0]));
}
Also used : FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) CompletableFuture(java.util.concurrent.CompletableFuture) ArrayList(java.util.ArrayList) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) SnapshotMetadata(org.apache.samza.storage.blobstore.index.SnapshotMetadata) Metadata(org.apache.samza.storage.blobstore.Metadata) DirIndex(org.apache.samza.storage.blobstore.index.DirIndex) CompletionStage(java.util.concurrent.CompletionStage)

Example 5 with DirIndex

use of org.apache.samza.storage.blobstore.index.DirIndex in project samza by apache.

the class DirDiffUtil method getDirDiff.

/**
 * Recursively computes the {@link DirDiff} between a local snapshot directory and the corresponding
 * remote snapshot {@link DirIndex}.
 * <p>
 * Files are categorized into files to upload, retain and remove using the provided predicate.
 * Sub-directories are categorized by name: present only locally (added), present on both sides
 * (retained, compared recursively), or present only remotely (removed).
 *
 * @param localSnapshotDir local directory to compare; must be non-null and an existing directory
 * @param remoteSnapshotDir remote directory index to compare against; must be non-null
 * @param areSameFile predicate deciding whether a local file and a remote file index are the same file
 * @param isRootDir whether this is the root of the snapshot; if so the resulting diff is named
 *                  {@link DirIndex#ROOT_DIR_NAME} instead of the local directory name
 * @return the diff between the local directory and the remote directory index
 */
private static DirDiff getDirDiff(File localSnapshotDir, DirIndex remoteSnapshotDir, BiPredicate<File, FileIndex> areSameFile, boolean isRootDir) {
    Preconditions.checkState(localSnapshotDir != null && localSnapshotDir.isDirectory());
    Preconditions.checkNotNull(remoteSnapshotDir);
    LOG.debug("Creating DirDiff between local dir: {} and remote dir: {}", localSnapshotDir.getPath(), remoteSnapshotDir.getDirName());
    List<DirDiff> subDirsAdded = new ArrayList<>();
    List<DirDiff> subDirsRetained = new ArrayList<>();
    List<DirIndex> subDirsRemoved = new ArrayList<>();
    // listFiles returns an empty array if the local snapshot directory is empty; it returns null only if
    // the path is not a directory or an I/O error occurs, which requireNonNull turns into a failure here.
    List<File> localSnapshotFiles = Arrays.asList(Objects.requireNonNull(localSnapshotDir.listFiles(File::isFile)));
    List<FileIndex> remoteSnapshotFiles = remoteSnapshotDir.getFilesPresent();
    // same null/empty semantics as above, this time selecting the sub-directories
    List<File> localSnapshotSubDirs = Arrays.asList(Objects.requireNonNull(localSnapshotDir.listFiles(File::isDirectory)));
    Set<String> localSnapshotSubDirNames = localSnapshotSubDirs.stream().map(File::getName).collect(Collectors.toCollection(HashSet::new));
    List<DirIndex> remoteSnapshotSubDirs = remoteSnapshotDir.getSubDirsPresent();
    Set<String> remoteSnapshotSubDirNames = remoteSnapshotSubDirs.stream().map(DirIndex::getDirName).collect(Collectors.toCollection(HashSet::new));
    // TODO MED shesharm: this compares each file in directory 3 times. Categorize files in one traversal instead.
    List<File> filesToUpload = getNewFilesToUpload(remoteSnapshotFiles, localSnapshotFiles, areSameFile);
    List<FileIndex> filesToRetain = getFilesToRetain(remoteSnapshotFiles, localSnapshotFiles, areSameFile);
    List<FileIndex> filesToRemove = getFilesToRemove(remoteSnapshotFiles, localSnapshotFiles, areSameFile);
    for (File localSnapshotSubDir : localSnapshotSubDirs) {
        if (!remoteSnapshotSubDirNames.contains(localSnapshotSubDir.getName())) {
            // 1. Subdir in local snapshot but not in remote snapshot
            LOG.debug("Subdir {} present in local snapshot but not in remote snapshot. " + "Recursively adding subdir contents.", localSnapshotSubDir.getPath());
            subDirsAdded.add(getDiffForNewDir(localSnapshotSubDir));
        } else {
            // 2. Subdir in both local and remote snapshot
            LOG.debug("Subdir {} present in local snapshot and in remote snapshot. " + "Recursively comparing local and remote subdirs.", localSnapshotSubDir.getPath());
            // A match is guaranteed here because the name was found in remoteSnapshotSubDirNames, which is
            // built from this same list.
            DirIndex remoteSubDirIndex = remoteSnapshotSubDirs.stream().filter(indexBlob -> indexBlob.getDirName().equals(localSnapshotSubDir.getName())).findFirst().get();
            subDirsRetained.add(getDirDiff(localSnapshotSubDir, remoteSubDirIndex, areSameFile, false));
        }
    }
    // 3. Subdir in remote snapshot but not in local snapshot
    for (DirIndex remoteSnapshotSubDir : remoteSnapshotSubDirs) {
        if (!localSnapshotSubDirNames.contains(remoteSnapshotSubDir.getDirName())) {
            // Log the sub-directory being removed, not the parent directory being diffed.
            LOG.debug("Subdir {} present in remote snapshot but not in local snapshot. " + "Marking for removal from remote snapshot. ", remoteSnapshotSubDir.getDirName());
            subDirsRemoved.add(remoteSnapshotSubDir);
        }
    }
    String dirName = isRootDir ? DirIndex.ROOT_DIR_NAME : localSnapshotDir.getName();
    return new DirDiff(dirName, filesToUpload, filesToRetain, filesToRemove, subDirsAdded, subDirsRetained, subDirsRemoved);
}
Also used : FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) ArrayList(java.util.ArrayList) DirDiff(org.apache.samza.storage.blobstore.diff.DirDiff) DirIndex(org.apache.samza.storage.blobstore.index.DirIndex) File(java.io.File)

Aggregations

DirIndex (org.apache.samza.storage.blobstore.index.DirIndex)39 Path (java.nio.file.Path)29 SnapshotMetadata (org.apache.samza.storage.blobstore.index.SnapshotMetadata)27 Test (org.junit.Test)26 File (java.io.File)25 SnapshotIndex (org.apache.samza.storage.blobstore.index.SnapshotIndex)25 ArrayList (java.util.ArrayList)23 Pair (org.apache.commons.lang3.tuple.Pair)23 CompletableFuture (java.util.concurrent.CompletableFuture)21 CompletionStage (java.util.concurrent.CompletionStage)20 CheckpointId (org.apache.samza.checkpoint.CheckpointId)20 SamzaException (org.apache.samza.SamzaException)19 DirDiff (org.apache.samza.storage.blobstore.diff.DirDiff)19 IOException (java.io.IOException)18 HashMap (java.util.HashMap)18 Checkpoint (org.apache.samza.checkpoint.Checkpoint)17 Files (java.nio.file.Files)16 List (java.util.List)16 Map (java.util.Map)16 Optional (java.util.Optional)16