Use of org.apache.samza.storage.blobstore.index.FileIndex in the Apache Samza project.
Class DirDiffUtil, method getFilesToRemove.
/**
 * Computes the remote files that must be deleted/reclaimed from the remote store: every file in the remote
 * snapshot that either has no same-named file in the new local snapshot, or whose same-named local file is
 * rejected by {@code areSameFile}.
 *
 * @param remoteSnapshotFiles file indexes recorded in the remote snapshot
 * @param localSnapshotFiles files present in the new local snapshot; file names are used as lookup keys
 * @param areSameFile predicate deciding whether a local file and a remote file index denote the same file
 * @return the remote file indexes to remove, in the order they appear in {@code remoteSnapshotFiles}
 */
private static List<FileIndex> getFilesToRemove(List<FileIndex> remoteSnapshotFiles, List<File> localSnapshotFiles, BiPredicate<File, FileIndex> areSameFile) {
  Map<String, File> localFilesByName = localSnapshotFiles.stream()
      .collect(Collectors.toMap(File::getName, Function.identity()));
  List<FileIndex> staleRemoteFiles = new ArrayList<>();
  for (FileIndex candidate : remoteSnapshotFiles) {
    String candidateName = candidate.getFileName();
    // The predicate is only consulted when a local file with the same name exists.
    boolean stillPresentLocally = localFilesByName.containsKey(candidateName)
        && areSameFile.test(localFilesByName.get(candidateName), candidate);
    if (stillPresentLocally) {
      continue;
    }
    LOG.debug("File {} only present in remote snapshot or is not the same as local file.", candidate.getFileName());
    staleRemoteFiles.add(candidate);
  }
  return staleRemoteFiles;
}
Use of org.apache.samza.storage.blobstore.index.FileIndex in the Apache Samza project.
Class DirDiffUtil, method getDirDiff.
/**
 * Recursively computes the {@link DirDiff} between a local snapshot directory and the index of the
 * corresponding remote snapshot directory.
 *
 * <p>Files are categorized into files to upload, retain and remove. Sub-directories are categorized into
 * added (local only), retained (present on both sides, diffed recursively) and removed (remote only).
 *
 * @param localSnapshotDir local snapshot directory; must be non-null and an existing directory
 * @param remoteSnapshotDir index of the remote snapshot directory; must be non-null
 * @param areSameFile predicate deciding whether a local file and a remote file index denote the same file
 * @param isRootDir if true, the returned diff is named {@link DirIndex#ROOT_DIR_NAME} instead of the local
 *                  directory's name
 * @return the diff describing how the remote snapshot must change to match the local snapshot
 * @throws IllegalStateException if {@code localSnapshotDir} is null or not a directory
 * @throws NullPointerException if {@code remoteSnapshotDir} is null, or if listing the local directory fails
 */
private static DirDiff getDirDiff(File localSnapshotDir, DirIndex remoteSnapshotDir, BiPredicate<File, FileIndex> areSameFile, boolean isRootDir) {
  Preconditions.checkState(localSnapshotDir != null && localSnapshotDir.isDirectory());
  Preconditions.checkNotNull(remoteSnapshotDir);
  LOG.debug("Creating DirDiff between local dir: {} and remote dir: {}", localSnapshotDir.getPath(), remoteSnapshotDir.getDirName());
  List<DirDiff> subDirsAdded = new ArrayList<>();
  List<DirDiff> subDirsRetained = new ArrayList<>();
  List<DirIndex> subDirsRemoved = new ArrayList<>();

  // File#listFiles returns an empty array for an empty directory and null only if the directory cannot be
  // listed (e.g. I/O error); requireNonNull turns the latter into an immediate failure.
  List<File> localSnapshotFiles = Arrays.asList(Objects.requireNonNull(localSnapshotDir.listFiles(File::isFile)));
  List<FileIndex> remoteSnapshotFiles = remoteSnapshotDir.getFilesPresent();

  // Same null/empty semantics as above, this time for the immediate sub-directories.
  List<File> localSnapshotSubDirs = Arrays.asList(Objects.requireNonNull(localSnapshotDir.listFiles(File::isDirectory)));
  Set<String> localSnapshotSubDirNames = localSnapshotSubDirs.stream().map(File::getName).collect(Collectors.toCollection(HashSet::new));
  List<DirIndex> remoteSnapshotSubDirs = remoteSnapshotDir.getSubDirsPresent();
  // Index remote sub-directories by name so each local sub-directory is matched with a single lookup.
  // On duplicate names the first entry wins, matching a first-match scan over the list.
  Map<String, DirIndex> remoteSnapshotSubDirsByName = remoteSnapshotSubDirs.stream()
      .collect(Collectors.toMap(DirIndex::getDirName, Function.identity(), (first, duplicate) -> first));

  // TODO MED shesharm: this compares each file in directory 3 times. Categorize files in one traversal instead.
  List<File> filesToUpload = getNewFilesToUpload(remoteSnapshotFiles, localSnapshotFiles, areSameFile);
  List<FileIndex> filesToRetain = getFilesToRetain(remoteSnapshotFiles, localSnapshotFiles, areSameFile);
  List<FileIndex> filesToRemove = getFilesToRemove(remoteSnapshotFiles, localSnapshotFiles, areSameFile);

  for (File localSnapshotSubDir : localSnapshotSubDirs) {
    String localSubDirName = localSnapshotSubDir.getName();
    if (!remoteSnapshotSubDirsByName.containsKey(localSubDirName)) {
      // 1. Subdir in local snapshot but not in remote snapshot
      LOG.debug("Subdir {} present in local snapshot but not in remote snapshot. " + "Recursively adding subdir contents.", localSnapshotSubDir.getPath());
      subDirsAdded.add(getDiffForNewDir(localSnapshotSubDir));
    } else {
      // 2. Subdir in both local and remote snapshots
      LOG.debug("Subdir {} present in local snapshot and in remote snapshot. " + "Recursively comparing local and remote subdirs.", localSnapshotSubDir.getPath());
      DirIndex remoteSubDirIndex = remoteSnapshotSubDirsByName.get(localSubDirName);
      subDirsRetained.add(getDirDiff(localSnapshotSubDir, remoteSubDirIndex, areSameFile, false));
    }
  }

  // 3. Subdir in remote snapshot but not in local snapshot
  for (DirIndex remoteSnapshotSubDir : remoteSnapshotSubDirs) {
    if (!localSnapshotSubDirNames.contains(remoteSnapshotSubDir.getDirName())) {
      // Log the sub-directory being removed, not its parent directory.
      LOG.debug("Subdir {} present in remote snapshot but not in local snapshot. " + "Marking for removal from remote snapshot. ", remoteSnapshotSubDir.getDirName());
      subDirsRemoved.add(remoteSnapshotSubDir);
    }
  }

  String dirName = isRootDir ? DirIndex.ROOT_DIR_NAME : localSnapshotDir.getName();
  return new DirDiff(dirName, filesToUpload, filesToRetain, filesToRemove, subDirsAdded, subDirsRetained, subDirsRemoved);
}
Use of org.apache.samza.storage.blobstore.index.FileIndex in the Apache Samza project.
Class BlobStoreTestUtil, method createFileIndex.
/**
 * Builds a {@link FileIndex} for the file at {@code filePath}: the checksum is the CRC32 of the file's full
 * contents, the metadata is read from the file itself, and the index holds a single {@link FileBlob} named
 * after {@code node.fileName} at offset 0.
 *
 * @param filePath path of the file to read
 * @param node directory tree node supplying the file name used for the index and its blob
 * @return the file index for the given file
 * @throws RuntimeException wrapping any exception raised while reading the file or its metadata
 */
private static FileIndex createFileIndex(String filePath, DirTreeNode node) {
  final long crcValue;
  final FileMetadata metadata;
  try {
    Path sourcePath = Paths.get(filePath);
    byte[] contents = Files.readAllBytes(sourcePath);
    Checksum crc = new CRC32();
    crc.update(contents, 0, contents.length);
    crcValue = crc.getValue();
    metadata = FileMetadata.fromFile(sourcePath.toFile());
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  return new FileIndex(
      node.fileName,
      ImmutableList.of(new FileBlob(node.fileName, 0)),
      metadata,
      crcValue);
}
Use of org.apache.samza.storage.blobstore.index.FileIndex in the Apache Samza project.
Class TestBlobStoreUtil, method testRestoreDirRestoresMultiPartFilesCorrectly.
/**
 * Restores a directory whose single file is split across 26 blobs and checks that the restored directory
 * matches the mocked remote directory index.
 */
@Test
public void testRestoreDirRestoresMultiPartFilesCorrectly() throws IOException {
  Path restoreBase = Files.createTempDirectory(BlobStoreTestUtil.TEMP_DIR_PREFIX);

  // Remote file consists of 26 blobs: blob ids 'a'..'z', blob contents equal to the blob id, offsets 0..25.
  DirIndex dirIndex = mock(DirIndex.class);
  when(dirIndex.getDirName()).thenReturn(DirIndex.ROOT_DIR_NAME);
  FileIndex fileIndex = mock(FileIndex.class);
  when(fileIndex.getFileName()).thenReturn("1.sst");

  // Create a scratch file to learn the current user/group/permissions so that the mocked file attributes
  // match those of the restored files.
  File scratchFile = Paths.get(restoreBase.toString(), "tempfile-" + new Random().nextInt()).toFile();
  scratchFile.createNewFile();
  PosixFileAttributes posixAttrs = Files.readAttributes(scratchFile.toPath(), PosixFileAttributes.class);
  String ownerName = posixAttrs.owner().getName();
  String groupName = posixAttrs.group().getName();
  String permissions = PosixFilePermissions.toString(posixAttrs.permissions());
  // ctime and mtime do not matter; size == 26.
  FileMetadata fileMetadata = new FileMetadata(1234L, 1243L, 26, ownerName, groupName, permissions);
  when(fileIndex.getFileMetadata()).thenReturn(fileMetadata);
  // Delete the scratch file so that it doesn't show up in the restored dir contents.
  Files.delete(scratchFile.toPath());

  List<FileBlob> fileBlobs = new ArrayList<>();
  StringBuilder expectedContents = new StringBuilder();
  for (int offset = 0; offset < 26; offset++) {
    char letter = (char) ('a' + offset);
    FileBlob fileBlob = mock(FileBlob.class);
    // blob contents == blobId
    expectedContents.append(letter);
    when(fileBlob.getBlobId()).thenReturn(String.valueOf(letter));
    when(fileBlob.getOffset()).thenReturn(offset);
    fileBlobs.add(fileBlob);
  }
  when(fileIndex.getBlobs()).thenReturn(fileBlobs);

  CRC32 expectedCrc = new CRC32();
  expectedCrc.update(expectedContents.toString().getBytes());
  when(fileIndex.getChecksum()).thenReturn(expectedCrc.getValue());
  when(dirIndex.getFilesPresent()).thenReturn(ImmutableList.of(fileIndex));

  // The fake blob store writes the requested blob id into the output stream as the blob's contents.
  Answer<CompletionStage<Void>> writeBlobIdAsContents = invocation -> {
    String blobId = invocation.getArgumentAt(0, String.class);
    OutputStream out = invocation.getArgumentAt(1, OutputStream.class);
    out.write(blobId.getBytes());
    ((FileOutputStream) out).getFD().sync();
    return CompletableFuture.completedFuture(null);
  };
  BlobStoreManager blobStoreManager = mock(BlobStoreManager.class);
  when(blobStoreManager.get(anyString(), any(OutputStream.class), any(Metadata.class)))
      .thenAnswer(writeBlobIdAsContents);

  BlobStoreUtil blobStoreUtil = new BlobStoreUtil(blobStoreManager, EXECUTOR, null, null);
  blobStoreUtil.restoreDir(restoreBase.toFile(), dirIndex, metadata).join();

  boolean restoredDirMatchesIndex =
      new DirDiffUtil().areSameDir(Collections.emptySet(), false).test(restoreBase.toFile(), dirIndex);
  assertTrue(restoredDirMatchesIndex);
}
Use of org.apache.samza.storage.blobstore.index.FileIndex in the Apache Samza project.
Class TestBlobStoreUtil, method testPutFileChecksumAndMetadata.
/**
 * Uploads a temp file through {@code putFile} against a mocked blob store and checks that the metadata
 * handed to the blob store carries the file's path and size, and that the returned {@link FileIndex}
 * carries the file's CRC32 checksum.
 */
@Test
public void testPutFileChecksumAndMetadata() throws IOException, ExecutionException, InterruptedException {
  // Setup: a temp file holding 1000 random characters, plus its expected CRC32.
  SnapshotMetadata snapshotMetadata = new SnapshotMetadata(checkpointId, jobName, jobId, taskName, storeName);
  Path tempFilePath = Files.createTempFile("samza-testPutFileChecksum-", ".tmp");
  new FileUtil().writeToTextFile(tempFilePath.toFile(), RandomStringUtils.random(1000), false);
  long expectedChecksum = FileUtils.checksumCRC32(tempFilePath.toFile());

  // The fake blob store drains the uploaded stream and reports a fixed blob id.
  Answer<CompletionStage<String>> drainAndReturnBlobId = invocation -> {
    InputStream uploaded = invocation.getArgumentAt(0, InputStream.class);
    IOUtils.copy(uploaded, NullOutputStream.NULL_OUTPUT_STREAM);
    return CompletableFuture.completedFuture("blobId");
  };
  BlobStoreManager blobStoreManager = mock(BlobStoreManager.class);
  ArgumentCaptor<Metadata> metadataCaptor = ArgumentCaptor.forClass(Metadata.class);
  when(blobStoreManager.put(any(InputStream.class), metadataCaptor.capture())).thenAnswer(drainAndReturnBlobId);

  // Act
  BlobStoreUtil blobStoreUtil = new BlobStoreUtil(blobStoreManager, EXECUTOR, null, null);
  CompletionStage<FileIndex> pendingFileIndex = blobStoreUtil.putFile(tempFilePath.toFile(), snapshotMetadata);
  FileIndex fileIndex = null;
  try {
    // should be already complete. if not, future composition in putFile is broken.
    fileIndex = pendingFileIndex.toCompletableFuture().get(0, TimeUnit.MILLISECONDS);
  } catch (TimeoutException e) {
    fail("Future returned from putFile should be already complete.");
  }

  // Assert
  Metadata capturedMetadata = metadataCaptor.getValue();
  assertEquals(tempFilePath.toAbsolutePath().toString(), capturedMetadata.getPayloadPath());
  assertEquals(tempFilePath.toFile().length(), Long.valueOf(capturedMetadata.getPayloadSize()).longValue());
  assertEquals(expectedChecksum, fileIndex.getChecksum());
}
Aggregations