Use of org.apache.samza.storage.blobstore.index.FileIndex in project samza by apache.
The class BlobStoreUtil, method putDir.
/**
 * Recursively upload all new files and upload or update contents of all subdirs in the {@link DirDiff} and return a
 * Future containing the {@link DirIndex} associated with the directory.
 * @param dirDiff diff for the contents of this directory
 * @param snapshotMetadata metadata (e.g. task name and store name) of the snapshot being uploaded
 * @return A future with the {@link DirIndex} if the upload completed successfully.
 */
public CompletionStage<DirIndex> putDir(DirDiff dirDiff, SnapshotMetadata snapshotMetadata) {
  // Upload all new files in the dir
  List<File> filesToUpload = dirDiff.getFilesAdded();
  List<CompletionStage<FileIndex>> fileFutures = filesToUpload.stream().map(file -> putFile(file, snapshotMetadata)).collect(Collectors.toList());
  CompletableFuture<Void> allFilesFuture = CompletableFuture.allOf(fileFutures.toArray(new CompletableFuture[0]));
  List<CompletionStage<DirIndex>> subDirFutures = new ArrayList<>();
  // recursively upload all new subdirs of this dir
  for (DirDiff subDirAdded : dirDiff.getSubDirsAdded()) {
    subDirFutures.add(putDir(subDirAdded, snapshotMetadata));
  }
  // recursively update contents of all subdirs that are retained but might have been modified
  for (DirDiff subDirRetained : dirDiff.getSubDirsRetained()) {
    subDirFutures.add(putDir(subDirRetained, snapshotMetadata));
  }
  CompletableFuture<Void> allDirBlobsFuture = CompletableFuture.allOf(subDirFutures.toArray(new CompletableFuture[0]));
  return CompletableFuture.allOf(allDirBlobsFuture, allFilesFuture).thenApplyAsync(f -> {
    LOG.trace("All file and dir uploads complete for task: {} store: {}", snapshotMetadata.getTaskName(), snapshotMetadata.getStoreName());
    List<FileIndex> filesPresent = fileFutures.stream().map(blob -> blob.toCompletableFuture().join()).collect(Collectors.toList());
    filesPresent.addAll(dirDiff.getFilesRetained());
    List<DirIndex> subDirsPresent = subDirFutures.stream().map(subDir -> subDir.toCompletableFuture().join()).collect(Collectors.toList());
    LOG.debug("Uploaded diff for task: {} store: {} with statistics: {}", snapshotMetadata.getTaskName(), snapshotMetadata.getStoreName(), DirDiff.getStats(dirDiff));
    LOG.trace("Returning new DirIndex for task: {} store: {}", snapshotMetadata.getTaskName(), snapshotMetadata.getStoreName());
    return new DirIndex(dirDiff.getDirName(), filesPresent, dirDiff.getFilesRemoved(), subDirsPresent, dirDiff.getSubDirsRemoved());
  }, executor);
}
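putDir fans out one putFile call per new file and one recursive putDir call per new or retained subdirectory, then uses CompletableFuture.allOf to wait for everything before assembling the new DirIndex. The following is a minimal, self-contained sketch of that allOf-then-join pattern using only java.util.concurrent; the "uploads" futures and their blob-id strings are hypothetical stand-ins, not Samza APIs. Note that the join() calls inside thenApply cannot block, because allOf guarantees every component future has already completed.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;

public class AllOfJoinSketch {
  public static void main(String[] args) {
    // Hypothetical per-file uploads; stand-ins for the putFile/putDir futures above.
    List<CompletableFuture<String>> uploads = new ArrayList<>();
    for (int i = 0; i < 3; i++) {
      final int n = i;
      uploads.add(CompletableFuture.supplyAsync(() -> "blob-" + n));
    }
    // Wait for all uploads, then join each future to collect the results.
    List<String> blobIds = CompletableFuture.allOf(uploads.toArray(new CompletableFuture[0]))
        .thenApply(v -> uploads.stream().map(CompletableFuture::join).collect(Collectors.toList()))
        .join();
    System.out.println(blobIds);
  }
}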
Use of org.apache.samza.storage.blobstore.index.FileIndex in project samza by apache.
The class BlobStoreUtil, method getFile.
/**
 * Gets a file from the blob store.
 * @param fileBlobs List of {@link FileBlob}s that constitute this file.
 * @param fileToRestore File pointing to the local path where the file will be restored.
 * @param requestMetadata {@link Metadata} associated with this request
 * @return a future that completes when the file has been downloaded and written, or completes exceptionally if an error occurs.
 */
@VisibleForTesting
CompletableFuture<Void> getFile(List<FileBlob> fileBlobs, File fileToRestore, Metadata requestMetadata) {
  FileOutputStream outputStream = null;
  try {
    long restoreFileStartTime = System.nanoTime();
    if (fileToRestore.exists()) {
      // delete the file if it already exists, e.g. from a previous retry.
      Files.delete(fileToRestore.toPath());
    }
    outputStream = new FileOutputStream(fileToRestore);
    final FileOutputStream finalOutputStream = outputStream;
    // TODO HIGH shesharm add integration tests to ensure empty files and directories are handled correctly E2E.
    // create file for 0 byte files (fileIndex entry but no fileBlobs).
    fileToRestore.createNewFile();
    // create a copy to ensure list being sorted is mutable.
    List<FileBlob> fileBlobsCopy = new ArrayList<>(fileBlobs);
    // sort by offset.
    fileBlobsCopy.sort(Comparator.comparingInt(FileBlob::getOffset));
    // chain the futures such that write to file for blobs is sequential.
    // can be optimized to write concurrently to the file later.
    CompletableFuture<Void> resultFuture = CompletableFuture.completedFuture(null);
    for (FileBlob fileBlob : fileBlobsCopy) {
      resultFuture = resultFuture.thenComposeAsync(v -> {
        LOG.debug("Starting restore for file: {} with blob id: {} at offset: {}", fileToRestore, fileBlob.getBlobId(), fileBlob.getOffset());
        return blobStoreManager.get(fileBlob.getBlobId(), finalOutputStream, requestMetadata);
      }, executor);
    }
    resultFuture = resultFuture.thenRunAsync(() -> {
      LOG.debug("Finished restore for file: {}. Closing output stream.", fileToRestore);
      try {
        // flush the file contents to disk
        finalOutputStream.getFD().sync();
        finalOutputStream.close();
      } catch (Exception e) {
        throw new SamzaException(String.format("Error closing output stream for file: %s", fileToRestore.getAbsolutePath()), e);
      }
    }, executor);
    resultFuture.whenComplete((res, ex) -> {
      if (restoreMetrics != null) {
        restoreMetrics.avgFileRestoreNs.update(System.nanoTime() - restoreFileStartTime);
        long fileSize = requestMetadata.getPayloadSize();
        restoreMetrics.restoreRate.inc(fileSize);
        restoreMetrics.filesRestored.getValue().addAndGet(1);
        restoreMetrics.bytesRestored.getValue().addAndGet(fileSize);
        restoreMetrics.filesRemaining.getValue().addAndGet(-1);
        restoreMetrics.bytesRemaining.getValue().addAndGet(-1 * fileSize);
      }
    });
    return resultFuture;
  } catch (Exception exception) {
    try {
      if (outputStream != null) {
        outputStream.close();
      }
    } catch (Exception err) {
      LOG.error("Error closing output stream for file: {}", fileToRestore.getAbsolutePath(), err);
    }
    throw new SamzaException(String.format("Error restoring file: %s in path: %s", fileToRestore.getName(), requestMetadata.getPayloadPath()), exception);
  }
}
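The key pattern in getFile is the sequential chain built with thenComposeAsync: blobs are written to the single FileOutputStream one at a time, in offset order, so the stream is never written concurrently. Below is a minimal, self-contained sketch of that chaining using only JDK classes; the in-memory stream and the hard-coded chunks are stand-ins for the blob downloads and the restored file, not Samza code.

import java.io.ByteArrayOutputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class SequentialWriteSketch {
  public static void main(String[] args) throws Exception {
    ExecutorService executor = Executors.newFixedThreadPool(2);
    OutputStream out = new ByteArrayOutputStream();
    List<byte[]> chunks = Arrays.asList("part-0|".getBytes(), "part-1|".getBytes(), "part-2|".getBytes());
    // Chain the writes so chunk i+1 starts only after chunk i has finished,
    // mirroring how getFile serializes blob downloads into one output stream.
    CompletableFuture<Void> chain = CompletableFuture.completedFuture(null);
    for (byte[] chunk : chunks) {
      chain = chain.thenComposeAsync(v -> CompletableFuture.runAsync(() -> {
        try {
          out.write(chunk);
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }, executor), executor);
    }
    chain.join();
    System.out.println(out); // prints: part-0|part-1|part-2|
    executor.shutdown();
  }
}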
Use of org.apache.samza.storage.blobstore.index.FileIndex in project samza by apache.
The class DirDiffUtil, method getNewFilesToUpload.
/**
 * Returns the list of files in the new local snapshot that need to be uploaded to the remote store.
 */
private static List<File> getNewFilesToUpload(List<FileIndex> remoteSnapshotFiles, List<File> localSnapshotFiles, BiPredicate<File, FileIndex> areSameFile) {
  List<File> filesToUpload = new ArrayList<>();
  Map<String, FileIndex> remoteFiles = remoteSnapshotFiles.stream().collect(Collectors.toMap(FileIndex::getFileName, Function.identity()));
  for (File localFile : localSnapshotFiles) {
    String localFileName = localFile.getName();
    if (!remoteFiles.containsKey(localFileName) || !areSameFile.test(localFile, remoteFiles.get(localFileName))) {
      LOG.debug("File {} only present in local snapshot or is not the same as remote file.", localFile.getPath());
      filesToUpload.add(localFile);
    }
  }
  return filesToUpload;
}
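getNewFilesToUpload delegates the actual comparison to the areSameFile predicate; Samza's real predicate (exercised in the testAreSameFile example further below) compares POSIX metadata and, for non-SST files, a CRC32 checksum. As a simplified illustration only, a predicate that compares just the file name and size might look like the sketch below. It assumes FileIndex exposes its metadata via a getFileMetadata() accessor and FileMetadata exposes getSize(); neither accessor is shown in the snippet above, so treat them as assumptions.

import java.io.File;
import java.util.function.BiPredicate;
import org.apache.samza.storage.blobstore.index.FileIndex;

public class SameFileSketch {
  // Simplified, illustrative predicate; Samza's DirDiffUtil.areSameFile(boolean) is stricter
  // (POSIX attributes plus checksum for non-SST files). Assumes FileIndex#getFileMetadata()
  // and FileMetadata#getSize() accessors, which are not confirmed by the snippets here.
  static BiPredicate<File, FileIndex> sameNameAndSize() {
    return (localFile, remoteIndex) ->
        localFile.getName().equals(remoteIndex.getFileName())
            && localFile.length() == remoteIndex.getFileMetadata().getSize();
  }
}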
Use of org.apache.samza.storage.blobstore.index.FileIndex in project samza by apache.
The class DirDiffUtil, method getFilesToRetain.
/**
 * Returns the list of files common to the local and remote snapshots. These files are reused from the previous
 * remote snapshot and do not need to be uploaded again.
 */
private static List<FileIndex> getFilesToRetain(List<FileIndex> remoteSnapshotFiles, List<File> localSnapshotFiles, BiPredicate<File, FileIndex> areSameFile) {
  List<FileIndex> filesToRetain = new ArrayList<>();
  Map<String, File> localFiles = localSnapshotFiles.stream().collect(Collectors.toMap(File::getName, Function.identity()));
  for (FileIndex remoteFile : remoteSnapshotFiles) {
    String remoteFileName = remoteFile.getFileName();
    if (localFiles.containsKey(remoteFileName) && areSameFile.test(localFiles.get(remoteFileName), remoteFile)) {
      String localFilePath = localFiles.get(remoteFileName).getPath();
      LOG.debug("File {} present in both local and remote snapshot and is the same.", localFilePath);
      filesToRetain.add(remoteFile);
    }
  }
  return filesToRetain;
}
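Taken together, getNewFilesToUpload and getFilesToRetain partition the local snapshot against the previous remote snapshot: new or modified files are uploaded, unchanged files are retained so their existing blob references can be reused, and remote files with no local counterpart become removals. The sketch below shows that three-way partitioning over plain file-name sets; the inputs are hypothetical and no Samza types are involved.

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class SnapshotPartitionSketch {
  public static void main(String[] args) {
    // Hypothetical file names; "changedLocally" marks local files whose contents differ from remote.
    Set<String> remote = new HashSet<>(Arrays.asList("000001.sst", "MANIFEST", "OPTIONS-old"));
    Set<String> local = new HashSet<>(Arrays.asList("000001.sst", "000002.sst", "MANIFEST"));
    Set<String> changedLocally = new HashSet<>(Arrays.asList("MANIFEST"));

    List<String> toUpload = local.stream()
        .filter(f -> !remote.contains(f) || changedLocally.contains(f))
        .collect(Collectors.toList());   // new or modified -> upload
    List<String> toRetain = local.stream()
        .filter(f -> remote.contains(f) && !changedLocally.contains(f))
        .collect(Collectors.toList());   // identical -> reuse remote blobs
    List<String> toRemove = remote.stream()
        .filter(f -> !local.contains(f))
        .collect(Collectors.toList());   // only remote -> remove

    System.out.println("upload=" + toUpload + " retain=" + toRetain + " remove=" + toRemove);
  }
}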
Use of org.apache.samza.storage.blobstore.index.FileIndex in project samza by apache.
The class TestBlobStoreUtil, method testAreSameFile.
@Test
public void testAreSameFile() throws IOException {
  FileUtil fileUtil = new FileUtil();
  // 1. test with sst file with same attributes
  Path sstFile = Files.createTempFile("samza-testAreSameFiles-", ".sst");
  PosixFileAttributes sstFileAttribs = Files.readAttributes(sstFile, PosixFileAttributes.class);
  FileMetadata sstFileMetadata = new FileMetadata(sstFileAttribs.creationTime().toMillis(), sstFileAttribs.lastModifiedTime().toMillis(), sstFileAttribs.size(), sstFileAttribs.owner().toString(), sstFileAttribs.group().toString(), PosixFilePermissions.toString(sstFileAttribs.permissions()));
  // checksum should be ignored for sst file. Set any dummy value
  FileIndex sstFileIndex = new FileIndex(sstFile.getFileName().toString(), Collections.emptyList(), sstFileMetadata, 0L);
  assertTrue(DirDiffUtil.areSameFile(false).test(sstFile.toFile(), sstFileIndex));
  // 2. test with sst file with different timestamps
  // Update last modified time
  Files.setLastModifiedTime(sstFile, FileTime.fromMillis(System.currentTimeMillis() + 1000L));
  assertTrue(DirDiffUtil.areSameFile(false).test(sstFile.toFile(), sstFileIndex));
  // 3. test with non-sst files with same metadata and content
  Path tmpFile = Files.createTempFile("samza-testAreSameFiles-", ".tmp");
  fileUtil.writeToTextFile(tmpFile.toFile(), RandomStringUtils.random(1000), false);
  PosixFileAttributes tmpFileAttribs = Files.readAttributes(tmpFile, PosixFileAttributes.class);
  FileMetadata tmpFileMetadata = new FileMetadata(tmpFileAttribs.creationTime().toMillis(), tmpFileAttribs.lastModifiedTime().toMillis(), tmpFileAttribs.size(), tmpFileAttribs.owner().toString(), tmpFileAttribs.group().toString(), PosixFilePermissions.toString(tmpFileAttribs.permissions()));
  FileIndex tmpFileIndex = new FileIndex(tmpFile.getFileName().toString(), Collections.emptyList(), tmpFileMetadata, FileUtils.checksumCRC32(tmpFile.toFile()));
  assertTrue(DirDiffUtil.areSameFile(false).test(tmpFile.toFile(), tmpFileIndex));
  // 4. test with non-sst files with different attributes
  // change lastModifiedTime of local file
  FileTime prevLastModified = tmpFileAttribs.lastModifiedTime();
  Files.setLastModifiedTime(tmpFile, FileTime.fromMillis(System.currentTimeMillis() + 1000L));
  assertTrue(DirDiffUtil.areSameFile(false).test(tmpFile.toFile(), tmpFileIndex));
  // change content/checksum of local file
  // reset attributes to match with remote file
  Files.setLastModifiedTime(tmpFile, prevLastModified);
  // new content
  fileUtil.writeToTextFile(tmpFile.toFile(), RandomStringUtils.random(1000), false);
  assertFalse(DirDiffUtil.areSameFile(false).test(tmpFile.toFile(), tmpFileIndex));
}
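The test builds each FileIndex from the file's POSIX attributes plus a CRC32 checksum computed with FileUtils.checksumCRC32 from Apache Commons IO. For reference, an equivalent checksum can be computed with only the JDK; the sketch below is an illustrative alternative, not what the test itself uses.

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.CRC32;

public class Crc32Sketch {
  // Computes a CRC32 of the file contents using only the JDK, matching the value
  // FileUtils.checksumCRC32(file) produces. Reads the whole file into memory,
  // so it is only suitable for small files.
  static long crc32(Path path) throws java.io.IOException {
    CRC32 crc = new CRC32();
    crc.update(Files.readAllBytes(path));
    return crc.getValue();
  }
}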