Search in sources :

Example 6 with FileIndex

use of org.apache.samza.storage.blobstore.index.FileIndex in project samza by apache.

the class DirDiffUtil method getFilesToRemove.

/**
 * Returns a list of files uploaded in remote checkpoint that are not present in new local snapshot and needs to be
 * deleted/reclaimed from remote store.
 */
private static List<FileIndex> getFilesToRemove(List<FileIndex> remoteSnapshotFiles, List<File> localSnapshotFiles, BiPredicate<File, FileIndex> areSameFile) {
    List<FileIndex> filesToRemove = new ArrayList<>();
    Map<String, File> localFiles = localSnapshotFiles.stream().collect(Collectors.toMap(File::getName, Function.identity()));
    for (FileIndex remoteFile : remoteSnapshotFiles) {
        String remoteFileName = remoteFile.getFileName();
        if (!localFiles.containsKey(remoteFileName) || !areSameFile.test(localFiles.get(remoteFileName), remoteFile)) {
            LOG.debug("File {} only present in remote snapshot or is not the same as local file.", remoteFile.getFileName());
            filesToRemove.add(remoteFile);
        }
    }
    return filesToRemove;
}
Also used : FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) ArrayList(java.util.ArrayList) File(java.io.File)

Example 7 with FileIndex

use of org.apache.samza.storage.blobstore.index.FileIndex in project samza by apache.

the class DirDiffUtil method getDirDiff.

private static DirDiff getDirDiff(File localSnapshotDir, DirIndex remoteSnapshotDir, BiPredicate<File, FileIndex> areSameFile, boolean isRootDir) {
    Preconditions.checkState(localSnapshotDir != null && localSnapshotDir.isDirectory());
    Preconditions.checkNotNull(remoteSnapshotDir);
    LOG.debug("Creating DirDiff between local dir: {} and remote dir: {}", localSnapshotDir.getPath(), remoteSnapshotDir.getDirName());
    List<DirDiff> subDirsAdded = new ArrayList<>();
    List<DirDiff> subDirsRetained = new ArrayList<>();
    List<DirIndex> subDirsRemoved = new ArrayList<>();
    // list files returns empty list if local snapshot directory is empty
    List<File> localSnapshotFiles = Arrays.asList(Objects.requireNonNull(localSnapshotDir.listFiles(File::isFile)));
    List<FileIndex> remoteSnapshotFiles = remoteSnapshotDir.getFilesPresent();
    // list files returns empty list if local snapshot directory is empty
    List<File> localSnapshotSubDirs = Arrays.asList(Objects.requireNonNull(localSnapshotDir.listFiles(File::isDirectory)));
    Set<String> localSnapshotSubDirNames = localSnapshotSubDirs.stream().map(File::getName).collect(Collectors.toCollection(HashSet::new));
    List<DirIndex> remoteSnapshotSubDirs = remoteSnapshotDir.getSubDirsPresent();
    Set<String> remoteSnapshotSubDirNames = remoteSnapshotSubDirs.stream().map(DirIndex::getDirName).collect(Collectors.toCollection(HashSet::new));
    // TODO MED shesharm: this compares each file in directory 3 times. Categorize files in one traversal instead.
    List<File> filesToUpload = getNewFilesToUpload(remoteSnapshotFiles, localSnapshotFiles, areSameFile);
    List<FileIndex> filesToRetain = getFilesToRetain(remoteSnapshotFiles, localSnapshotFiles, areSameFile);
    List<FileIndex> filesToRemove = getFilesToRemove(remoteSnapshotFiles, localSnapshotFiles, areSameFile);
    for (File localSnapshotSubDir : localSnapshotSubDirs) {
        if (!remoteSnapshotSubDirNames.contains(localSnapshotSubDir.getName())) {
            LOG.debug("Subdir {} present in local snapshot but not in remote snapshot. " + "Recursively adding subdir contents.", localSnapshotSubDir.getPath());
            subDirsAdded.add(getDiffForNewDir(localSnapshotSubDir));
        } else {
            LOG.debug("Subdir {} present in local snapshot and in remote snapshot. " + "Recursively comparing local and remote subdirs.", localSnapshotSubDir.getPath());
            DirIndex remoteSubDirIndex = remoteSnapshotSubDirs.stream().filter(indexBlob -> indexBlob.getDirName().equals(localSnapshotSubDir.getName())).findFirst().get();
            subDirsRetained.add(getDirDiff(localSnapshotSubDir, remoteSubDirIndex, areSameFile, false));
        }
    }
    // 3. Subdir in remote snapshot but not in local snapshot
    for (DirIndex remoteSnapshotSubDir : remoteSnapshotSubDirs) {
        if (!localSnapshotSubDirNames.contains(remoteSnapshotSubDir.getDirName())) {
            LOG.debug("Subdir {} present in remote snapshot but not in local snapshot. " + "Marking for removal from remote snapshot. ", remoteSnapshotDir.getDirName());
            subDirsRemoved.add(remoteSnapshotSubDir);
        }
    }
    String dirName = isRootDir ? DirIndex.ROOT_DIR_NAME : localSnapshotDir.getName();
    return new DirDiff(dirName, filesToUpload, filesToRetain, filesToRemove, subDirsAdded, subDirsRetained, subDirsRemoved);
}
Also used : FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) ArrayList(java.util.ArrayList) DirDiff(org.apache.samza.storage.blobstore.diff.DirDiff) DirIndex(org.apache.samza.storage.blobstore.index.DirIndex) File(java.io.File)

Example 8 with FileIndex

use of org.apache.samza.storage.blobstore.index.FileIndex in project samza by apache.

the class BlobStoreTestUtil method createFileIndex.

private static FileIndex createFileIndex(String filePath, DirTreeNode node) {
    long checksum;
    FileMetadata fileMetadata;
    try {
        Path path = Paths.get(filePath);
        Checksum crc32 = new CRC32();
        byte[] fileBytes = Files.readAllBytes(path);
        crc32.update(fileBytes, 0, fileBytes.length);
        checksum = crc32.getValue();
        fileMetadata = FileMetadata.fromFile(path.toFile());
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    return new FileIndex(node.fileName, ImmutableList.of(new FileBlob(node.fileName, 0)), fileMetadata, checksum);
}
Also used : Path(java.nio.file.Path) FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) FileBlob(org.apache.samza.storage.blobstore.index.FileBlob) CRC32(java.util.zip.CRC32) Checksum(java.util.zip.Checksum) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) IOException(java.io.IOException)

Example 9 with FileIndex

use of org.apache.samza.storage.blobstore.index.FileIndex in project samza by apache.

the class TestBlobStoreUtil method testRestoreDirRestoresMultiPartFilesCorrectly.

@Test
public void testRestoreDirRestoresMultiPartFilesCorrectly() throws IOException {
    Path restoreDirBasePath = Files.createTempDirectory(BlobStoreTestUtil.TEMP_DIR_PREFIX);
    // remote file == 26 blobs, blob ids from a to z, blob contents from a to z, offsets 0 to 25.
    DirIndex mockDirIndex = mock(DirIndex.class);
    when(mockDirIndex.getDirName()).thenReturn(DirIndex.ROOT_DIR_NAME);
    FileIndex mockFileIndex = mock(FileIndex.class);
    when(mockFileIndex.getFileName()).thenReturn("1.sst");
    // setup mock file attributes. create a temp file to get current user/group/permissions so that they
    // match with restored files.
    File tmpFile = Paths.get(restoreDirBasePath.toString(), "tempfile-" + new Random().nextInt()).toFile();
    tmpFile.createNewFile();
    PosixFileAttributes attrs = Files.readAttributes(tmpFile.toPath(), PosixFileAttributes.class);
    FileMetadata fileMetadata = new // ctime mtime does not matter. size == 26
    FileMetadata(// ctime mtime does not matter. size == 26
    1234L, // ctime mtime does not matter. size == 26
    1243L, // ctime mtime does not matter. size == 26
    26, attrs.owner().getName(), attrs.group().getName(), PosixFilePermissions.toString(attrs.permissions()));
    when(mockFileIndex.getFileMetadata()).thenReturn(fileMetadata);
    // delete so that it doesn't show up in restored dir contents.
    Files.delete(tmpFile.toPath());
    List<FileBlob> mockFileBlobs = new ArrayList<>();
    StringBuilder fileContents = new StringBuilder();
    for (int i = 0; i < 26; i++) {
        FileBlob mockFileBlob = mock(FileBlob.class);
        char c = (char) ('a' + i);
        // blob contents == blobId
        fileContents.append(c);
        when(mockFileBlob.getBlobId()).thenReturn(String.valueOf(c));
        when(mockFileBlob.getOffset()).thenReturn(i);
        mockFileBlobs.add(mockFileBlob);
    }
    when(mockFileIndex.getBlobs()).thenReturn(mockFileBlobs);
    CRC32 checksum = new CRC32();
    checksum.update(fileContents.toString().getBytes());
    when(mockFileIndex.getChecksum()).thenReturn(checksum.getValue());
    when(mockDirIndex.getFilesPresent()).thenReturn(ImmutableList.of(mockFileIndex));
    BlobStoreManager mockBlobStoreManager = mock(BlobStoreManager.class);
    when(mockBlobStoreManager.get(anyString(), any(OutputStream.class), any(Metadata.class))).thenAnswer((Answer<CompletionStage<Void>>) invocationOnMock -> {
        String blobId = invocationOnMock.getArgumentAt(0, String.class);
        OutputStream outputStream = invocationOnMock.getArgumentAt(1, OutputStream.class);
        outputStream.write(blobId.getBytes());
        ((FileOutputStream) outputStream).getFD().sync();
        return CompletableFuture.completedFuture(null);
    });
    BlobStoreUtil blobStoreUtil = new BlobStoreUtil(mockBlobStoreManager, EXECUTOR, null, null);
    blobStoreUtil.restoreDir(restoreDirBasePath.toFile(), mockDirIndex, metadata).join();
    assertTrue(new DirDiffUtil().areSameDir(Collections.emptySet(), false).test(restoreDirBasePath.toFile(), mockDirIndex));
}
Also used : Path(java.nio.file.Path) SortedSet(java.util.SortedSet) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) FileTime(java.nio.file.attribute.FileTime) TimeoutException(java.util.concurrent.TimeoutException) Random(java.util.Random) RetriableException(org.apache.samza.storage.blobstore.exceptions.RetriableException) FileUtil(org.apache.samza.util.FileUtil) Pair(org.apache.commons.lang3.tuple.Pair) Map(java.util.Map) Path(java.nio.file.Path) FutureUtil(org.apache.samza.util.FutureUtil) ImmutableSet(com.google.common.collect.ImmutableSet) PosixFileAttributes(java.nio.file.attribute.PosixFileAttributes) ImmutableMap(com.google.common.collect.ImmutableMap) Set(java.util.Set) CompletionException(java.util.concurrent.CompletionException) Checkpoint(org.apache.samza.checkpoint.Checkpoint) DirDiff(org.apache.samza.storage.blobstore.diff.DirDiff) CheckpointId(org.apache.samza.checkpoint.CheckpointId) IOUtils(org.apache.commons.io.IOUtils) List(java.util.List) CompletionStage(java.util.concurrent.CompletionStage) SnapshotIndex(org.apache.samza.storage.blobstore.index.SnapshotIndex) Optional(java.util.Optional) RandomStringUtils(org.apache.commons.lang3.RandomStringUtils) SnapshotMetadata(org.apache.samza.storage.blobstore.index.SnapshotMetadata) MoreExecutors(com.google.common.util.concurrent.MoreExecutors) DirIndex(org.apache.samza.storage.blobstore.index.DirIndex) FileBlob(org.apache.samza.storage.blobstore.index.FileBlob) Matchers(org.mockito.Matchers) CheckpointV2(org.apache.samza.checkpoint.CheckpointV2) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Answer(org.mockito.stubbing.Answer) PosixFilePermissions(java.nio.file.attribute.PosixFilePermissions) ArgumentCaptor(org.mockito.ArgumentCaptor) ImmutableList(com.google.common.collect.ImmutableList) BlobStoreManager(org.apache.samza.storage.blobstore.BlobStoreManager) BlobStoreStateBackendFactory(org.apache.samza.storage.blobstore.BlobStoreStateBackendFactory) ExecutorService(java.util.concurrent.ExecutorService) OutputStream(java.io.OutputStream) FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) Files(java.nio.file.Files) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) FileUtils(org.apache.commons.io.FileUtils) Test(org.junit.Test) Metadata(org.apache.samza.storage.blobstore.Metadata) File(java.io.File) SamzaException(org.apache.samza.SamzaException) ExecutionException(java.util.concurrent.ExecutionException) TimeUnit(java.util.concurrent.TimeUnit) Mockito(org.mockito.Mockito) Ignore(org.junit.Ignore) Paths(java.nio.file.Paths) NullOutputStream(org.apache.commons.io.output.NullOutputStream) CRC32(java.util.zip.CRC32) Assert(org.junit.Assert) Collections(java.util.Collections) InputStream(java.io.InputStream) DeletedException(org.apache.samza.storage.blobstore.exceptions.DeletedException) FileBlob(org.apache.samza.storage.blobstore.index.FileBlob) CRC32(java.util.zip.CRC32) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) NullOutputStream(org.apache.commons.io.output.NullOutputStream) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) ArrayList(java.util.ArrayList) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) SnapshotMetadata(org.apache.samza.storage.blobstore.index.SnapshotMetadata) Metadata(org.apache.samza.storage.blobstore.Metadata) BlobStoreManager(org.apache.samza.storage.blobstore.BlobStoreManager) Checkpoint(org.apache.samza.checkpoint.Checkpoint) FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) Random(java.util.Random) FileOutputStream(java.io.FileOutputStream) DirIndex(org.apache.samza.storage.blobstore.index.DirIndex) PosixFileAttributes(java.nio.file.attribute.PosixFileAttributes) File(java.io.File) CompletionStage(java.util.concurrent.CompletionStage) Test(org.junit.Test)

Example 10 with FileIndex

use of org.apache.samza.storage.blobstore.index.FileIndex in project samza by apache.

the class TestBlobStoreUtil method testPutFileChecksumAndMetadata.

@Test
public void testPutFileChecksumAndMetadata() throws IOException, ExecutionException, InterruptedException {
    // Setup
    SnapshotMetadata snapshotMetadata = new SnapshotMetadata(checkpointId, jobName, jobId, taskName, storeName);
    Path path = Files.createTempFile("samza-testPutFileChecksum-", ".tmp");
    FileUtil fileUtil = new FileUtil();
    fileUtil.writeToTextFile(path.toFile(), RandomStringUtils.random(1000), false);
    long expectedChecksum = FileUtils.checksumCRC32(path.toFile());
    BlobStoreManager blobStoreManager = mock(BlobStoreManager.class);
    ArgumentCaptor<Metadata> argumentCaptor = ArgumentCaptor.forClass(Metadata.class);
    when(blobStoreManager.put(any(InputStream.class), argumentCaptor.capture())).thenAnswer((Answer<CompletionStage<String>>) invocation -> {
        InputStream inputStream = invocation.getArgumentAt(0, InputStream.class);
        IOUtils.copy(inputStream, NullOutputStream.NULL_OUTPUT_STREAM);
        return CompletableFuture.completedFuture("blobId");
    });
    BlobStoreUtil blobStoreUtil = new BlobStoreUtil(blobStoreManager, EXECUTOR, null, null);
    CompletionStage<FileIndex> fileIndexFuture = blobStoreUtil.putFile(path.toFile(), snapshotMetadata);
    FileIndex fileIndex = null;
    try {
        // should be already complete. if not, future composition in putFile is broken.
        fileIndex = fileIndexFuture.toCompletableFuture().get(0, TimeUnit.MILLISECONDS);
    } catch (TimeoutException e) {
        fail("Future returned from putFile should be already complete.");
    }
    // Assert
    Metadata metadata = (Metadata) argumentCaptor.getValue();
    assertEquals(path.toAbsolutePath().toString(), metadata.getPayloadPath());
    assertEquals(path.toFile().length(), Long.valueOf(metadata.getPayloadSize()).longValue());
    assertEquals(expectedChecksum, fileIndex.getChecksum());
}
Also used : Path(java.nio.file.Path) SortedSet(java.util.SortedSet) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) FileTime(java.nio.file.attribute.FileTime) TimeoutException(java.util.concurrent.TimeoutException) Random(java.util.Random) RetriableException(org.apache.samza.storage.blobstore.exceptions.RetriableException) FileUtil(org.apache.samza.util.FileUtil) Pair(org.apache.commons.lang3.tuple.Pair) Map(java.util.Map) Path(java.nio.file.Path) FutureUtil(org.apache.samza.util.FutureUtil) ImmutableSet(com.google.common.collect.ImmutableSet) PosixFileAttributes(java.nio.file.attribute.PosixFileAttributes) ImmutableMap(com.google.common.collect.ImmutableMap) Set(java.util.Set) CompletionException(java.util.concurrent.CompletionException) Checkpoint(org.apache.samza.checkpoint.Checkpoint) DirDiff(org.apache.samza.storage.blobstore.diff.DirDiff) CheckpointId(org.apache.samza.checkpoint.CheckpointId) IOUtils(org.apache.commons.io.IOUtils) List(java.util.List) CompletionStage(java.util.concurrent.CompletionStage) SnapshotIndex(org.apache.samza.storage.blobstore.index.SnapshotIndex) Optional(java.util.Optional) RandomStringUtils(org.apache.commons.lang3.RandomStringUtils) SnapshotMetadata(org.apache.samza.storage.blobstore.index.SnapshotMetadata) MoreExecutors(com.google.common.util.concurrent.MoreExecutors) DirIndex(org.apache.samza.storage.blobstore.index.DirIndex) FileBlob(org.apache.samza.storage.blobstore.index.FileBlob) Matchers(org.mockito.Matchers) CheckpointV2(org.apache.samza.checkpoint.CheckpointV2) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Answer(org.mockito.stubbing.Answer) PosixFilePermissions(java.nio.file.attribute.PosixFilePermissions) ArgumentCaptor(org.mockito.ArgumentCaptor) ImmutableList(com.google.common.collect.ImmutableList) BlobStoreManager(org.apache.samza.storage.blobstore.BlobStoreManager) BlobStoreStateBackendFactory(org.apache.samza.storage.blobstore.BlobStoreStateBackendFactory) ExecutorService(java.util.concurrent.ExecutorService) OutputStream(java.io.OutputStream) FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) Files(java.nio.file.Files) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) FileUtils(org.apache.commons.io.FileUtils) Test(org.junit.Test) Metadata(org.apache.samza.storage.blobstore.Metadata) File(java.io.File) SamzaException(org.apache.samza.SamzaException) ExecutionException(java.util.concurrent.ExecutionException) TimeUnit(java.util.concurrent.TimeUnit) Mockito(org.mockito.Mockito) Ignore(org.junit.Ignore) Paths(java.nio.file.Paths) NullOutputStream(org.apache.commons.io.output.NullOutputStream) CRC32(java.util.zip.CRC32) Assert(org.junit.Assert) Collections(java.util.Collections) InputStream(java.io.InputStream) DeletedException(org.apache.samza.storage.blobstore.exceptions.DeletedException) InputStream(java.io.InputStream) FileMetadata(org.apache.samza.storage.blobstore.index.FileMetadata) SnapshotMetadata(org.apache.samza.storage.blobstore.index.SnapshotMetadata) Metadata(org.apache.samza.storage.blobstore.Metadata) BlobStoreManager(org.apache.samza.storage.blobstore.BlobStoreManager) FileIndex(org.apache.samza.storage.blobstore.index.FileIndex) SnapshotMetadata(org.apache.samza.storage.blobstore.index.SnapshotMetadata) FileUtil(org.apache.samza.util.FileUtil) CompletionStage(java.util.concurrent.CompletionStage) TimeoutException(java.util.concurrent.TimeoutException) Test(org.junit.Test)

Aggregations

FileIndex (org.apache.samza.storage.blobstore.index.FileIndex)17 ArrayList (java.util.ArrayList)15 FileMetadata (org.apache.samza.storage.blobstore.index.FileMetadata)13 File (java.io.File)12 DirIndex (org.apache.samza.storage.blobstore.index.DirIndex)12 CompletableFuture (java.util.concurrent.CompletableFuture)11 Metadata (org.apache.samza.storage.blobstore.Metadata)11 SnapshotMetadata (org.apache.samza.storage.blobstore.index.SnapshotMetadata)11 CompletionStage (java.util.concurrent.CompletionStage)10 FileBlob (org.apache.samza.storage.blobstore.index.FileBlob)10 IOException (java.io.IOException)9 CRC32 (java.util.zip.CRC32)8 SamzaException (org.apache.samza.SamzaException)8 DirDiff (org.apache.samza.storage.blobstore.diff.DirDiff)8 ImmutableMap (com.google.common.collect.ImmutableMap)7 FileOutputStream (java.io.FileOutputStream)7 InputStream (java.io.InputStream)7 Files (java.nio.file.Files)7 Paths (java.nio.file.Paths)7 Collections (java.util.Collections)7