Search in sources:

Example 11 with FileIndex

Use of org.apache.samza.storage.blobstore.index.FileIndex in the Apache Samza project.

Class BlobStoreUtil, method putDir.

/**
 * Recursively uploads all new files, and uploads or updates the contents of all sub-directories, in the
 * {@link DirDiff}, and returns a future containing the {@link DirIndex} associated with the directory.
 *
 * @param dirDiff diff for the contents of this directory
 * @param snapshotMetadata snapshot metadata passed to every file and sub-directory upload; its task name and
 *                         store name are also used in log messages
 * @return A future with the {@link DirIndex} if the upload completed successfully.
 */
public CompletionStage<DirIndex> putDir(DirDiff dirDiff, SnapshotMetadata snapshotMetadata) {
    // Upload all new files in the dir
    List<File> filesToUpload = dirDiff.getFilesAdded();
    List<CompletionStage<FileIndex>> fileFutures = filesToUpload.stream().map(file -> putFile(file, snapshotMetadata)).collect(Collectors.toList());
    // Convert each stage explicitly: copying CompletionStage elements straight into a CompletableFuture[]
    // throws ArrayStoreException if any stage is not itself a CompletableFuture.
    CompletableFuture<Void> allFilesFuture = CompletableFuture.allOf(fileFutures.stream().map(CompletionStage::toCompletableFuture).toArray(CompletableFuture[]::new));
    List<CompletionStage<DirIndex>> subDirFutures = new ArrayList<>();
    // recursively upload all new subdirs of this dir
    for (DirDiff subDirAdded : dirDiff.getSubDirsAdded()) {
        subDirFutures.add(putDir(subDirAdded, snapshotMetadata));
    }
    // recursively update contents of all subdirs that are retained but might have been modified
    for (DirDiff subDirRetained : dirDiff.getSubDirsRetained()) {
        subDirFutures.add(putDir(subDirRetained, snapshotMetadata));
    }
    CompletableFuture<Void> allDirBlobsFuture = CompletableFuture.allOf(subDirFutures.stream().map(CompletionStage::toCompletableFuture).toArray(CompletableFuture[]::new));
    return CompletableFuture.allOf(allDirBlobsFuture, allFilesFuture).thenApplyAsync(f -> {
        LOG.trace("All file and dir uploads complete for task: {} store: {}", snapshotMetadata.getTaskName(), snapshotMetadata.getStoreName());
        // every future has already completed successfully at this point, so join() does not block
        List<FileIndex> filesPresent = fileFutures.stream().map(blob -> blob.toCompletableFuture().join()).collect(Collectors.toList());
        // files unchanged since the previous snapshot are carried over without re-upload
        filesPresent.addAll(dirDiff.getFilesRetained());
        List<DirIndex> subDirsPresent = subDirFutures.stream().map(subDir -> subDir.toCompletableFuture().join()).collect(Collectors.toList());
        LOG.debug("Uploaded diff for task: {} store: {} with statistics: {}", snapshotMetadata.getTaskName(), snapshotMetadata.getStoreName(), DirDiff.getStats(dirDiff));
        LOG.trace("Returning new DirIndex for task: {} store: {}", snapshotMetadata.getTaskName(), snapshotMetadata.getStoreName());
        return new DirIndex(dirDiff.getDirName(), filesPresent, dirDiff.getFilesRemoved(), subDirsPresent, dirDiff.getSubDirsRemoved());
    }, executor);
}
Also used : CheckedInputStream(java.util.zip.CheckedInputStream) BlobStoreRestoreManagerMetrics(org.apache.samza.storage.blobstore.metrics.BlobStoreRestoreManagerMetrics) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) LoggerFactory(org.slf4j.LoggerFactory) RetriableException(org.apache.samza.storage.blobstore.exceptions.RetriableException) StringUtils(org.apache.commons.lang3.StringUtils) SnapshotIndexSerde(org.apache.samza.storage.blobstore.index.serde.SnapshotIndexSerde) ByteArrayInputStream(java.io.ByteArrayInputStream) Pair(org.apache.commons.lang3.tuple.Pair) Map(java.util.Map) FutureUtil(org.apache.samza.util.FutureUtil) ImmutableMap(com.google.common.collect.ImmutableMap) Predicate(java.util.function.Predicate) Set(java.util.Set) CompletionException(java.util.concurrent.CompletionException) Checkpoint(org.apache.samza.checkpoint.Checkpoint) Collectors(java.util.stream.Collectors) DirDiff(org.apache.samza.storage.blobstore.diff.DirDiff) List(java.util.List) CompletionStage(java.util.concurrent.CompletionStage) SnapshotIndex(org.apache.samza.storage.blobstore.index.SnapshotIndex) BlobStoreBackupManagerMetrics(org.apache.samza.storage.blobstore.metrics.BlobStoreBackupManagerMetrics) Optional(java.util.Optional) SnapshotMetadata(org.apache.samza.storage.blobstore.index.SnapshotMetadata) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DirIndex(org.apache.samza.storage.blobstore.index.DirIndex) FileBlob(org.apache.samza.storage.blobstore.index.FileBlob) CheckpointV2(org.apache.samza.checkpoint.CheckpointV2) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) BlobStoreManager(org.apache.samza.storage.blobstore.BlobStoreManager) BlobStoreStateBackendFactory(org.apache.samza.storage.blobstore.BlobStoreStateBackendFactory) ExecutorService(java.util.concurrent.ExecutorService) FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) 
Logger(org.slf4j.Logger) Files(java.nio.file.Files) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) Metadata(org.apache.samza.storage.blobstore.Metadata) File(java.io.File) SamzaException(org.apache.samza.SamzaException) Paths(java.nio.file.Paths) CRC32(java.util.zip.CRC32) Preconditions(com.google.common.base.Preconditions) VisibleForTesting(com.google.common.annotations.VisibleForTesting) Comparator(java.util.Comparator) Collections(java.util.Collections) InputStream(java.io.InputStream) DeletedException(org.apache.samza.storage.blobstore.exceptions.DeletedException) ArrayList(java.util.ArrayList) DirDiff(org.apache.samza.storage.blobstore.diff.DirDiff) CompletableFuture(java.util.concurrent.CompletableFuture) FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) DirIndex(org.apache.samza.storage.blobstore.index.DirIndex) File(java.io.File) CompletionStage(java.util.concurrent.CompletionStage)

Example 12 with FileIndex

Use of org.apache.samza.storage.blobstore.index.FileIndex in the Apache Samza project.

Class BlobStoreUtil, method getFile.

/**
 * Gets a file from the blob store.
 *
 * <p>Any existing file at the target path is deleted first. The blobs are then fetched one after another in
 * ascending offset order and written to a single output stream, which is synced to disk and closed once the
 * last blob has been written. If any asynchronous step fails, the output stream is still closed so the file
 * descriptor is not leaked.
 *
 * @param fileBlobs List of {@link FileBlob}s that constitute this file.
 * @param fileToRestore File pointing to the local path where the file will be restored.
 * @param requestMetadata {@link Metadata} associated with this request
 * @return a future that completes when the file is downloaded and written or if an exception occurs.
 * @throws SamzaException if the restore could not be set up (e.g. the existing file could not be deleted or the
 *                        output stream could not be opened)
 */
@VisibleForTesting
CompletableFuture<Void> getFile(List<FileBlob> fileBlobs, File fileToRestore, Metadata requestMetadata) {
    FileOutputStream outputStream = null;
    try {
        long restoreFileStartTime = System.nanoTime();
        if (fileToRestore.exists()) {
            // delete the file if it already exists, e.g. from a previous retry.
            Files.delete(fileToRestore.toPath());
        }
        outputStream = new FileOutputStream(fileToRestore);
        // lambdas below need an effectively final reference to the stream
        final FileOutputStream finalOutputStream = outputStream;
        // TODO HIGH shesharm add integration tests to ensure empty files and directories are handled correctly E2E.
        // create file for 0 byte files (fileIndex entry but no fileBlobs).
        fileToRestore.createNewFile();
        // create a copy to ensure list being sorted is mutable.
        List<FileBlob> fileBlobsCopy = new ArrayList<>(fileBlobs);
        // sort by offset.
        fileBlobsCopy.sort(Comparator.comparingInt(FileBlob::getOffset));
        // chain the futures such that write to file for blobs is sequential.
        // can be optimized to write concurrently to the file later.
        CompletableFuture<Void> resultFuture = CompletableFuture.completedFuture(null);
        for (FileBlob fileBlob : fileBlobsCopy) {
            resultFuture = resultFuture.thenComposeAsync(v -> {
                LOG.debug("Starting restore for file: {} with blob id: {} at offset: {}", fileToRestore, fileBlob.getBlobId(), fileBlob.getOffset());
                return blobStoreManager.get(fileBlob.getBlobId(), finalOutputStream, requestMetadata);
            }, executor);
        }
        resultFuture = resultFuture.thenRunAsync(() -> {
            LOG.debug("Finished restore for file: {}. Closing output stream.", fileToRestore);
            try {
                // flush the file contents to disk
                finalOutputStream.getFD().sync();
                finalOutputStream.close();
            } catch (Exception e) {
                throw new SamzaException(String.format("Error closing output stream for file: %s", fileToRestore.getAbsolutePath()), e);
            }
        }, executor);
        // Side branch only: the returned future is resultFuture itself, so exception propagation to callers
        // is unaffected by anything done in this callback.
        resultFuture.whenComplete((res, ex) -> {
            if (ex != null) {
                // The success-path close above is skipped when a blob download fails, and may not be reached
                // when the sync fails. Close here so the file descriptor is not leaked; closing an already
                // closed FileOutputStream is a no-op.
                try {
                    finalOutputStream.close();
                } catch (IOException err) {
                    LOG.error("Error closing output stream for file: {}", fileToRestore.getAbsolutePath(), err);
                }
            }
            // NOTE(review): metrics are updated on both success and failure — confirm this is intended.
            if (restoreMetrics != null) {
                restoreMetrics.avgFileRestoreNs.update(System.nanoTime() - restoreFileStartTime);
                long fileSize = requestMetadata.getPayloadSize();
                restoreMetrics.restoreRate.inc(fileSize);
                restoreMetrics.filesRestored.getValue().addAndGet(1);
                restoreMetrics.bytesRestored.getValue().addAndGet(fileSize);
                restoreMetrics.filesRemaining.getValue().addAndGet(-1);
                restoreMetrics.bytesRemaining.getValue().addAndGet(-1 * fileSize);
            }
        });
        return resultFuture;
    } catch (Exception exception) {
        // synchronous setup failure: release the stream (if it was opened) before rethrowing
        try {
            if (outputStream != null) {
                outputStream.close();
            }
        } catch (Exception err) {
            LOG.error("Error closing output stream for file: {}", fileToRestore.getAbsolutePath(), err);
        }
        throw new SamzaException(String.format("Error restoring file: %s in path: %s", fileToRestore.getName(), requestMetadata.getPayloadPath()), exception);
    }
}
Also used : CheckedInputStream(java.util.zip.CheckedInputStream) BlobStoreRestoreManagerMetrics(org.apache.samza.storage.blobstore.metrics.BlobStoreRestoreManagerMetrics) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) LoggerFactory(org.slf4j.LoggerFactory) RetriableException(org.apache.samza.storage.blobstore.exceptions.RetriableException) StringUtils(org.apache.commons.lang3.StringUtils) SnapshotIndexSerde(org.apache.samza.storage.blobstore.index.serde.SnapshotIndexSerde) ByteArrayInputStream(java.io.ByteArrayInputStream) Pair(org.apache.commons.lang3.tuple.Pair) Map(java.util.Map) FutureUtil(org.apache.samza.util.FutureUtil) ImmutableMap(com.google.common.collect.ImmutableMap) Predicate(java.util.function.Predicate) Set(java.util.Set) CompletionException(java.util.concurrent.CompletionException) Checkpoint(org.apache.samza.checkpoint.Checkpoint) Collectors(java.util.stream.Collectors) DirDiff(org.apache.samza.storage.blobstore.diff.DirDiff) List(java.util.List) CompletionStage(java.util.concurrent.CompletionStage) SnapshotIndex(org.apache.samza.storage.blobstore.index.SnapshotIndex) BlobStoreBackupManagerMetrics(org.apache.samza.storage.blobstore.metrics.BlobStoreBackupManagerMetrics) Optional(java.util.Optional) SnapshotMetadata(org.apache.samza.storage.blobstore.index.SnapshotMetadata) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DirIndex(org.apache.samza.storage.blobstore.index.DirIndex) FileBlob(org.apache.samza.storage.blobstore.index.FileBlob) CheckpointV2(org.apache.samza.checkpoint.CheckpointV2) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) BlobStoreManager(org.apache.samza.storage.blobstore.BlobStoreManager) BlobStoreStateBackendFactory(org.apache.samza.storage.blobstore.BlobStoreStateBackendFactory) ExecutorService(java.util.concurrent.ExecutorService) FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) 
Logger(org.slf4j.Logger) Files(java.nio.file.Files) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) Metadata(org.apache.samza.storage.blobstore.Metadata) File(java.io.File) SamzaException(org.apache.samza.SamzaException) Paths(java.nio.file.Paths) CRC32(java.util.zip.CRC32) Preconditions(com.google.common.base.Preconditions) VisibleForTesting(com.google.common.annotations.VisibleForTesting) Comparator(java.util.Comparator) Collections(java.util.Collections) InputStream(java.io.InputStream) DeletedException(org.apache.samza.storage.blobstore.exceptions.DeletedException) FileBlob(org.apache.samza.storage.blobstore.index.FileBlob) FileOutputStream(java.io.FileOutputStream) ArrayList(java.util.ArrayList) SamzaException(org.apache.samza.SamzaException) RetriableException(org.apache.samza.storage.blobstore.exceptions.RetriableException) CompletionException(java.util.concurrent.CompletionException) IOException(java.io.IOException) SamzaException(org.apache.samza.SamzaException) DeletedException(org.apache.samza.storage.blobstore.exceptions.DeletedException) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 13 with FileIndex

Use of org.apache.samza.storage.blobstore.index.FileIndex in the Apache Samza project.

Class DirDiffUtil, method getNewFilesToUpload.

/**
 * Selects the files of the locally created snapshot that have to be uploaded to the remote store: every local
 * file that either has no remote file with the same name, or whose same-named remote file is not considered the
 * same file by {@code areSameFile}.
 */
private static List<File> getNewFilesToUpload(List<FileIndex> remoteSnapshotFiles, List<File> localSnapshotFiles, BiPredicate<File, FileIndex> areSameFile) {
    // index the remote files by name for constant-time lookup
    Map<String, FileIndex> remoteByName = remoteSnapshotFiles.stream().collect(Collectors.toMap(FileIndex::getFileName, Function.identity()));
    List<File> pendingUploads = new ArrayList<>();
    for (File candidate : localSnapshotFiles) {
        FileIndex remoteMatch = remoteByName.get(candidate.getName());
        boolean alreadyUploaded = remoteMatch != null && areSameFile.test(candidate, remoteMatch);
        if (alreadyUploaded) {
            continue;
        }
        LOG.debug("File {} only present in local snapshot or is not the same as remote file.", candidate.getPath());
        pendingUploads.add(candidate);
    }
    return pendingUploads;
}
Also used : FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) ArrayList(java.util.ArrayList) File(java.io.File)

Example 14 with FileIndex

Use of org.apache.samza.storage.blobstore.index.FileIndex in the Apache Samza project.

Class DirDiffUtil, method getFilesToRetain.

/**
 * Selects the remote snapshot files that are also present, unchanged, in the local snapshot. A remote file is
 * selected when a local file with the same name exists and {@code areSameFile} accepts the pair; such files are
 * reused from the previous remote snapshot and do not need to be uploaded again.
 */
private static List<FileIndex> getFilesToRetain(List<FileIndex> remoteSnapshotFiles, List<File> localSnapshotFiles, BiPredicate<File, FileIndex> areSameFile) {
    // index the local files by name for constant-time lookup
    Map<String, File> localByName = localSnapshotFiles.stream().collect(Collectors.toMap(File::getName, Function.identity()));
    List<FileIndex> retained = new ArrayList<>();
    for (FileIndex remoteEntry : remoteSnapshotFiles) {
        File localMatch = localByName.get(remoteEntry.getFileName());
        if (localMatch == null || !areSameFile.test(localMatch, remoteEntry)) {
            continue;
        }
        LOG.debug("File {} present in both local and remote snapshot and is the same.", localMatch.getPath());
        retained.add(remoteEntry);
    }
    return retained;
}
Also used : FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) ArrayList(java.util.ArrayList) File(java.io.File)

Example 15 with FileIndex

Use of org.apache.samza.storage.blobstore.index.FileIndex in the Apache Samza project.

Class TestBlobStoreUtil, method testAreSameFile.

/**
 * Exercises {@code DirDiffUtil.areSameFile(false)} against temp files: an sst file with matching attributes, the
 * same sst file after its last-modified time is bumped, a non-sst file with matching metadata and checksum, the
 * same non-sst file after its last-modified time is bumped, and finally the non-sst file after its content is
 * rewritten (the only case expected to compare as different).
 */
@Test
public void testAreSameFile() throws IOException {
    FileUtil fileUtil = new FileUtil();

    // 1. sst file, identical attributes
    Path sstPath = Files.createTempFile("samza-testAreSameFiles-", ".sst");
    PosixFileAttributes sstAttrs = Files.readAttributes(sstPath, PosixFileAttributes.class);
    long sstCreatedMs = sstAttrs.creationTime().toMillis();
    long sstModifiedMs = sstAttrs.lastModifiedTime().toMillis();
    long sstSize = sstAttrs.size();
    String sstOwner = sstAttrs.owner().toString();
    String sstGroup = sstAttrs.group().toString();
    String sstPermissions = PosixFilePermissions.toString(sstAttrs.permissions());
    FileMetadata sstMetadata = new FileMetadata(sstCreatedMs, sstModifiedMs, sstSize, sstOwner, sstGroup, sstPermissions);
    // the checksum is expected to be ignored for sst files, so a dummy value is used
    FileIndex sstIndex = new FileIndex(sstPath.getFileName().toString(), Collections.emptyList(), sstMetadata, 0L);
    assertTrue(DirDiffUtil.areSameFile(false).test(sstPath.toFile(), sstIndex));

    // 2. sst file, last-modified time moved forward
    Files.setLastModifiedTime(sstPath, FileTime.fromMillis(System.currentTimeMillis() + 1000L));
    assertTrue(DirDiffUtil.areSameFile(false).test(sstPath.toFile(), sstIndex));

    // 3. non-sst file, identical metadata and content
    Path plainPath = Files.createTempFile("samza-testAreSameFiles-", ".tmp");
    fileUtil.writeToTextFile(plainPath.toFile(), RandomStringUtils.random(1000), false);
    PosixFileAttributes plainAttrs = Files.readAttributes(plainPath, PosixFileAttributes.class);
    long plainCreatedMs = plainAttrs.creationTime().toMillis();
    long plainModifiedMs = plainAttrs.lastModifiedTime().toMillis();
    long plainSize = plainAttrs.size();
    String plainOwner = plainAttrs.owner().toString();
    String plainGroup = plainAttrs.group().toString();
    String plainPermissions = PosixFilePermissions.toString(plainAttrs.permissions());
    FileMetadata plainMetadata = new FileMetadata(plainCreatedMs, plainModifiedMs, plainSize, plainOwner, plainGroup, plainPermissions);
    FileIndex plainIndex = new FileIndex(plainPath.getFileName().toString(), Collections.emptyList(), plainMetadata, FileUtils.checksumCRC32(plainPath.toFile()));
    assertTrue(DirDiffUtil.areSameFile(false).test(plainPath.toFile(), plainIndex));

    // 4. non-sst file, differing attributes
    // 4a. only the local last-modified time changes
    FileTime originalModifiedTime = plainAttrs.lastModifiedTime();
    Files.setLastModifiedTime(plainPath, FileTime.fromMillis(System.currentTimeMillis() + 1000L));
    assertTrue(DirDiffUtil.areSameFile(false).test(plainPath.toFile(), plainIndex));
    // 4b. content (and therefore checksum) changes; restore the timestamp first so that it matches the
    // remote file again, then overwrite the file with fresh random content
    Files.setLastModifiedTime(plainPath, originalModifiedTime);
    fileUtil.writeToTextFile(plainPath.toFile(), RandomStringUtils.random(1000), false);
    assertFalse(DirDiffUtil.areSameFile(false).test(plainPath.toFile(), plainIndex));
}
Also used : Path(java.nio.file.Path) FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) FileTime(java.nio.file.attribute.FileTime) PosixFileAttributes(java.nio.file.attribute.PosixFileAttributes) FileUtil(org.apache.samza.util.FileUtil) Test(org.junit.Test)

Aggregations

FileIndex (org.apache.samza.storage.blobstore.index.FileIndex)17 ArrayList (java.util.ArrayList)15 FileMetadata (org.apache.samza.storage.blobstore.index.FileMetadata)13 File (java.io.File)12 DirIndex (org.apache.samza.storage.blobstore.index.DirIndex)12 CompletableFuture (java.util.concurrent.CompletableFuture)11 Metadata (org.apache.samza.storage.blobstore.Metadata)11 SnapshotMetadata (org.apache.samza.storage.blobstore.index.SnapshotMetadata)11 CompletionStage (java.util.concurrent.CompletionStage)10 FileBlob (org.apache.samza.storage.blobstore.index.FileBlob)10 IOException (java.io.IOException)9 CRC32 (java.util.zip.CRC32)8 SamzaException (org.apache.samza.SamzaException)8 DirDiff (org.apache.samza.storage.blobstore.diff.DirDiff)8 ImmutableMap (com.google.common.collect.ImmutableMap)7 FileOutputStream (java.io.FileOutputStream)7 InputStream (java.io.InputStream)7 Files (java.nio.file.Files)7 Paths (java.nio.file.Paths)7 Collections (java.util.Collections)7