Search in sources :

Example 6 with CopyableFile

use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.

the class RecoveryHelperTest method testPersistFile.

/**
 * Persists a staged file through {@link RecoveryHelper} and checks that it leaves the staging directory,
 * appears under the persist directory with its contents intact, and is returned by
 * {@code findPersistedFile} only when the supplied predicate accepts it.
 */
@Test
public void testPersistFile() throws Exception {
    String content = "contents";
    File stagingDir = Files.createTempDir();
    stagingDir.deleteOnExit();
    File file = new File(stagingDir, "file");
    // try-with-resources: the stream is released even if the write throws.
    try (OutputStream os = new FileOutputStream(file)) {
        IOUtils.write(content, os);
    }
    Assert.assertEquals(stagingDir.listFiles().length, 1);
    State state = new State();
    state.setProp(RecoveryHelper.PERSIST_DIR_KEY, this.tmpDir.getAbsolutePath());
    state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/publisher");
    File recoveryDir = new File(RecoveryHelper.getPersistDir(state).get().toUri().getPath());
    FileSystem fs = FileSystem.getLocal(new Configuration());
    CopyableFile copyableFile = CopyableFile.builder(fs, new FileStatus(0, false, 0, 0, 0, new Path("/file")), new Path("/dataset"), CopyConfiguration.builder(fs, state.getProperties()).preserve(PreserveAttributes.fromMnemonicString("")).build()).build();
    CopySource.setWorkUnitGuid(state, Guid.fromHasGuid(copyableFile));
    RecoveryHelper recoveryHelper = new RecoveryHelper(FileSystem.getLocal(new Configuration()), state);
    recoveryHelper.persistFile(state, copyableFile, new Path(file.getAbsolutePath()));
    // The staged file must be gone from staging and present (one level down) in the persist directory.
    Assert.assertEquals(stagingDir.listFiles().length, 0);
    Assert.assertEquals(recoveryDir.listFiles().length, 1);
    File fileInRecovery = recoveryDir.listFiles()[0].listFiles()[0];
    // Close the read handle as well; the original left this stream open.
    try (FileInputStream persistedContent = new FileInputStream(fileInRecovery)) {
        Assert.assertEquals(IOUtils.readLines(persistedContent).get(0), content);
    }
    Optional<FileStatus> fileToRecover = recoveryHelper.findPersistedFile(state, copyableFile, Predicates.<FileStatus>alwaysTrue());
    Assert.assertTrue(fileToRecover.isPresent());
    Assert.assertEquals(fileToRecover.get().getPath().toUri().getPath(), fileInRecovery.getAbsolutePath());
    // A predicate that rejects everything must yield no persisted file.
    fileToRecover = recoveryHelper.findPersistedFile(state, copyableFile, Predicates.<FileStatus>alwaysFalse());
    Assert.assertFalse(fileToRecover.isPresent());
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) Configuration(org.apache.hadoop.conf.Configuration) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) FileInputStream(java.io.FileInputStream) State(org.apache.gobblin.configuration.State) FileSystem(org.apache.hadoop.fs.FileSystem) File(java.io.File) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) Test(org.testng.annotations.Test)

Example 7 with CopyableFile

use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.

the class ConfigBasedDatasetTest method testGetCopyableFiles.

/**
 * Runs {@code testGetCopyableFilesHelper} twice over the {@code configBasedDatasetTest} resources, varying
 * only its final boolean argument, and checks the number of returned copy entities each time. For the
 * second result it then verifies every entity: copyable files must map to the same relative path on source
 * and destination, pre-publish steps must be deletions of {@code dest/dir1/file1}, and post-publish steps
 * must generate watermark metadata mentioning the source watermark.
 */
@Test
public void testGetCopyableFiles() throws Exception {
    String sourceDir = getClass().getClassLoader().getResource("configBasedDatasetTest/src").getFile();
    String destinationDir = getClass().getClassLoader().getResource("configBasedDatasetTest/dest").getFile();
    long sourceWatermark = 100L;
    Collection<? extends CopyEntity> copyableFiles = testGetCopyableFilesHelper(sourceDir, destinationDir, sourceWatermark, false);
    Assert.assertEquals(copyableFiles.size(), 8);
    copyableFiles = testGetCopyableFilesHelper(sourceDir, destinationDir, sourceWatermark, true);
    Assert.assertEquals(copyableFiles.size(), 6);
    Set<Path> paths = Sets.newHashSet(new Path("dir1/file2"), new Path("dir1/file1"), new Path("dir2/file1"), new Path("dir2/file3"));
    for (CopyEntity copyEntity : copyableFiles) {
        if (copyEntity instanceof CopyableFile) {
            CopyableFile file = (CopyableFile) copyEntity;
            Path originRelativePath = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(file.getOrigin().getPath()), PathUtils.getPathWithoutSchemeAndAuthority(new Path(sourceDir)));
            Path targetRelativePath = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(file.getDestination()), PathUtils.getPathWithoutSchemeAndAuthority(new Path(destinationDir)));
            Assert.assertTrue(paths.contains(originRelativePath));
            Assert.assertTrue(paths.contains(targetRelativePath));
            Assert.assertEquals(originRelativePath, targetRelativePath);
        } else if (copyEntity instanceof PrePublishStep) {
            PrePublishStep pre = (PrePublishStep) copyEntity;
            Assert.assertTrue(pre.getStep() instanceof DeleteFileCommitStep);
            // need to delete this file; contains() also accepts a match at index 0, which "indexOf > 0" rejected
            Assert.assertTrue(pre.explain().contains("configBasedDatasetTest/dest/dir1/file1"));
        } else if (copyEntity instanceof PostPublishStep) {
            PostPublishStep post = (PostPublishStep) copyEntity;
            Assert.assertTrue(post.getStep() instanceof WatermarkMetadataGenerationCommitStep);
            // Fetch the explanation once and check both expected fragments against it.
            String explanation = post.explain();
            Assert.assertTrue(explanation.contains("dest/_metadata") && explanation.contains(String.valueOf(sourceWatermark)));
        } else {
            // Report the offending type instead of throwing a bare Exception.
            Assert.fail("Wrong type: " + copyEntity.getClass().getName());
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) PostPublishStep(org.apache.gobblin.data.management.copy.entities.PostPublishStep) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) DeleteFileCommitStep(org.apache.gobblin.util.commit.DeleteFileCommitStep) PrePublishStep(org.apache.gobblin.data.management.copy.entities.PrePublishStep) Test(org.testng.annotations.Test)

Example 8 with CopyableFile

use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.

the class FileAwareInputStreamDataWriterTest method testWrite.

/**
 * Writes a small string through {@link FileAwareInputStreamDataWriter}, commits it, and checks that the
 * file found under the writer output directory holds exactly that string.
 */
@Test
public void testWrite() throws Exception {
    String streamString = "testContents";
    FileStatus status = fs.getFileStatus(testTempPath);
    OwnerAndPermission ownerAndPermission = new OwnerAndPermission(status.getOwner(), status.getGroup(), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    CopyableFile cf = CopyableFileUtils.getTestCopyableFile(ownerAndPermission);
    CopyableDatasetMetadata metadata = new CopyableDatasetMetadata(new TestCopyableDataset(new Path("/source")));
    WorkUnitState state = TestUtils.createTestWorkUnitState();
    state.setProp(ConfigurationKeys.WRITER_STAGING_DIR, new Path(testTempPath, "staging").toString());
    state.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, new Path(testTempPath, "output").toString());
    state.setProp(ConfigurationKeys.WRITER_FILE_PATH, RandomStringUtils.randomAlphabetic(5));
    CopySource.serializeCopyEntity(state, cf);
    CopySource.serializeCopyableDataset(state, metadata);
    FileAwareInputStreamDataWriter dataWriter = new FileAwareInputStreamDataWriter(state, 1, 0);
    FileAwareInputStream fileAwareInputStream = new FileAwareInputStream(cf, StreamUtils.convertStream(IOUtils.toInputStream(streamString)));
    dataWriter.write(fileAwareInputStream);
    dataWriter.commit();
    // Expected location: <output dir>/<dataset-and-partition identifier>/<destination of the copyable file>.
    Path writtenFilePath = new Path(new Path(state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR), cf.getDatasetAndPartition(metadata).identifier()), cf.getDestination());
    // try-with-resources: the original never closed this read handle.
    try (FileInputStream written = new FileInputStream(writtenFilePath.toString())) {
        Assert.assertEquals(IOUtils.toString(written), streamString);
    }
}
Also used : TestCopyableDataset(org.apache.gobblin.data.management.copy.TestCopyableDataset) Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) CopyableDatasetMetadata(org.apache.gobblin.data.management.copy.CopyableDatasetMetadata) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) OwnerAndPermission(org.apache.gobblin.data.management.copy.OwnerAndPermission) FileAwareInputStream(org.apache.gobblin.data.management.copy.FileAwareInputStream) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) FsPermission(org.apache.hadoop.fs.permission.FsPermission) FileInputStream(java.io.FileInputStream) Test(org.testng.annotations.Test)

Example 9 with CopyableFile

use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.

the class TarArchiveInputStreamDataWriterTest method getCompressedInputStream.

/**
 * Find the test compressed file {@code filePath} in classpath and read it as a {@link FileAwareInputStream}.
 *
 * @param filePath classpath-relative location of the compressed test resource
 * @param newFileName file name used to build the destination of the accompanying {@link CopyableFile}
 * @return the first record produced by {@link UnGzipConverter} for the resource, or {@code null} if the
 *         converter produced none
 * @throws IllegalArgumentException if {@code filePath} cannot be found on the classpath
 */
private FileAwareInputStream getCompressedInputStream(final String filePath, final String newFileName) throws Exception {
    UnGzipConverter converter = new UnGzipConverter();
    FileSystem fs = FileSystem.getLocal(new Configuration());
    // getResource returns null for a missing resource; fail with the path instead of a bare NullPointerException.
    java.net.URL resource = getClass().getClassLoader().getResource(filePath);
    if (resource == null) {
        throw new IllegalArgumentException("Test resource not found on classpath: " + filePath);
    }
    String fullPath = resource.getFile();
    FileStatus status = fs.getFileStatus(testTempPath);
    OwnerAndPermission ownerAndPermission = new OwnerAndPermission(status.getOwner(), status.getGroup(), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
    CopyableFile cf = CopyableFileUtils.getTestCopyableFile(filePath, new Path(testTempPath, newFileName).toString(), newFileName, ownerAndPermission);
    // The opened stream is handed to the returned FileAwareInputStream, so it is deliberately not closed here.
    FileAwareInputStream fileAwareInputStream = new FileAwareInputStream(cf, fs.open(new Path(fullPath)));
    Iterable<FileAwareInputStream> iterable = converter.convertRecord("outputSchema", fileAwareInputStream, new WorkUnitState());
    return Iterables.getFirst(iterable, null);
}
Also used : Path(org.apache.hadoop.fs.Path) UnGzipConverter(org.apache.gobblin.data.management.copy.converter.UnGzipConverter) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) FileSystem(org.apache.hadoop.fs.FileSystem) OwnerAndPermission(org.apache.gobblin.data.management.copy.OwnerAndPermission) FileAwareInputStream(org.apache.gobblin.data.management.copy.FileAwareInputStream) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) FsPermission(org.apache.hadoop.fs.permission.FsPermission)

Example 10 with CopyableFile

use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.

the class TimestampBasedCopyableDatasetTest method testIsCopyableFile.

/**
 * Test {@link TimestampBasedCopyableDataset.CopyableFileGenerator}'s logic to determine copyable files.
 *
 * <p>Three scenarios are exercised against a scratch directory tree on the local file system: a file
 * present only on the source, a file present on both sides where the target copy was created later, and a
 * publish directory that is the dataset root itself (so source and target are the same files).
 */
@Test
public void testIsCopyableFile() throws IOException, InterruptedException {
    Path testRoot = new Path("testCopyableFileGenerator");
    Path srcRoot = new Path(testRoot, "datasetRoot");
    String versionDir = "dummyversion";
    Path versionPath = new Path(srcRoot, versionDir);
    Path targetDir = new Path(testRoot, "target");
    // Start from a clean slate in case an earlier run left the scratch tree behind.
    if (this.localFs.exists(testRoot)) {
        this.localFs.delete(testRoot, true);
    }
    this.localFs.mkdirs(versionPath);
    Path srcfile = new Path(versionPath, "file1");
    // create() hands back an open output stream; close it right away so the handle is not leaked.
    this.localFs.create(srcfile).close();
    this.localFs.mkdirs(targetDir);
    Properties props = new Properties();
    props.put(TimestampBasedCopyableDataset.COPY_POLICY, TimeBasedCopyPolicyForTest.class.getName());
    props.put(TimestampBasedCopyableDataset.DATASET_VERSION_FINDER, TimestampedDatasetVersionFinderForTest.class.getName());
    Path datasetRootPath = this.localFs.getFileStatus(srcRoot).getPath();
    TimestampBasedCopyableDataset copyabledataset = new TimestampBasedCopyableDataset(localFs, props, datasetRootPath);
    TimestampedDatasetVersion srcVersion = new TimestampedDatasetVersion(new DateTime(), versionPath);
    // Generator whose generateCopyableFile returns a mock that only reports the source path as its file set.
    class SimpleCopyableFileGenerator extends TimestampBasedCopyableDataset.CopyableFileGenerator {

        public SimpleCopyableFileGenerator(TimestampBasedCopyableDataset copyableDataset, FileSystem srcFs, FileSystem targetFs, CopyConfiguration configuration, TimestampedDatasetVersion copyableVersion, ConcurrentLinkedQueue<CopyableFile> copyableFileList) {
            super(srcFs, targetFs, configuration, copyableDataset.datasetRoot(), configuration.getPublishDir(), copyableVersion.getDateTime(), copyableVersion.getPaths(), copyableFileList, copyableDataset.copyableFileFilter());
        }

        @Override
        protected CopyableFile generateCopyableFile(FileStatus singleFile, Path targetPath, long timestampFromPath, Path locationToCopy) throws IOException {
            CopyableFile mockCopyableFile = mock(CopyableFile.class);
            when(mockCopyableFile.getFileSet()).thenReturn(singleFile.getPath().toString());
            return mockCopyableFile;
        }
    }
    // When srcFile exists on src but not on target, srcFile should be included in the copyableFileList.
    CopyConfiguration configuration1 = mock(CopyConfiguration.class);
    when(configuration1.getPublishDir()).thenReturn(localFs.getFileStatus(targetDir).getPath());
    ConcurrentLinkedQueue<CopyableFile> copyableFileList1 = new ConcurrentLinkedQueue<>();
    TimestampBasedCopyableDataset.CopyableFileGenerator copyFileGenerator1 = new SimpleCopyableFileGenerator(copyabledataset, localFs, localFs, configuration1, srcVersion, copyableFileList1);
    copyFileGenerator1.run();
    Assert.assertEquals(copyableFileList1.size(), 1);
    Assert.assertEquals(copyableFileList1.poll().getFileSet(), localFs.getFileStatus(srcfile).getPath().toString());
    // When files exist on both locations but with different timestamp, the result should only include newer src files.
    String noNeedToCopyFile = "file2";
    Path oldSrcFile = new Path(versionPath, noNeedToCopyFile);
    this.localFs.create(oldSrcFile).close();
    // Pause so the target copy created below is created after the source copy.
    Thread.sleep(100);
    Path newTargetfile = new Path(targetDir, new Path(versionDir, noNeedToCopyFile));
    this.localFs.create(newTargetfile).close();
    copyFileGenerator1.run();
    // Only file1 is expected again; file2 must not be queued.
    Assert.assertEquals(copyableFileList1.size(), 1);
    Assert.assertEquals(copyableFileList1.poll().getFileSet(), localFs.getFileStatus(srcfile).getPath().toString());
    // When srcFile exists on both locations and have the same modified timestamp, it should not be included in copyableFileList.
    CopyConfiguration configuration2 = mock(CopyConfiguration.class);
    when(configuration2.getPublishDir()).thenReturn(localFs.getFileStatus(datasetRootPath).getPath());
    ConcurrentLinkedQueue<CopyableFile> copyableFileList2 = new ConcurrentLinkedQueue<>();
    TimestampBasedCopyableDataset.CopyableFileGenerator copyFileGenerator2 = new SimpleCopyableFileGenerator(copyabledataset, localFs, localFs, configuration2, srcVersion, copyableFileList2);
    copyFileGenerator2.run();
    Assert.assertEquals(copyableFileList2.size(), 0);
    this.localFs.delete(testRoot, true);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) TimestampedDatasetVersion(org.apache.gobblin.data.management.version.TimestampedDatasetVersion) Properties(java.util.Properties) DateTime(org.joda.time.DateTime) FileSystem(org.apache.hadoop.fs.FileSystem) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) Test(org.testng.annotations.Test) BeforeTest(org.testng.annotations.BeforeTest)

Aggregations

CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile)20 Path (org.apache.hadoop.fs.Path)15 Test (org.testng.annotations.Test)9 CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity)8 FileStatus (org.apache.hadoop.fs.FileStatus)7 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)6 CopyConfiguration (org.apache.gobblin.data.management.copy.CopyConfiguration)6 CopyableDatasetMetadata (org.apache.gobblin.data.management.copy.CopyableDatasetMetadata)5 OwnerAndPermission (org.apache.gobblin.data.management.copy.OwnerAndPermission)5 Configuration (org.apache.hadoop.conf.Configuration)5 IOException (java.io.IOException)4 Properties (java.util.Properties)4 FileAwareInputStream (org.apache.gobblin.data.management.copy.FileAwareInputStream)4 PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep)4 DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep)4 FileSystem (org.apache.hadoop.fs.FileSystem)4 FsPermission (org.apache.hadoop.fs.permission.FsPermission)4 FileInputStream (java.io.FileInputStream)3 ConcurrentLinkedQueue (java.util.concurrent.ConcurrentLinkedQueue)3 TestCopyableDataset (org.apache.gobblin.data.management.copy.TestCopyableDataset)3