Search in sources :

Example 1 with CopyConfiguration

Use of org.apache.gobblin.data.management.copy.CopyConfiguration in the Apache incubator-gobblin project.

From the class TimestampBasedCopyableDatasetTest, method testIsCopyableFile.

/**
 * Test {@link TimestampBasedCopyableDataset.CopyableFileGenerator}'s logic to determine copyable files.
 *
 * A source file is expected in the copyable list when it is absent from the target, or when the
 * target copy is older than the source copy; a file whose target copy is same-age or newer is skipped.
 */
@Test
public void testIsCopyableFile() throws IOException, InterruptedException {
    Path testRoot = new Path("testCopyableFileGenerator");
    Path srcRoot = new Path(testRoot, "datasetRoot");
    String versionDir = "dummyversion";
    Path versionPath = new Path(srcRoot, versionDir);
    Path targetDir = new Path(testRoot, "target");
    // Start from a clean slate in case a previous run left files behind.
    if (this.localFs.exists(testRoot)) {
        this.localFs.delete(testRoot, true);
    }
    this.localFs.mkdirs(versionPath);
    Path srcfile = new Path(versionPath, "file1");
    // Close the stream returned by create() so the handle is not leaked and the file's
    // modification time is finalized before the generator inspects it.
    this.localFs.create(srcfile).close();
    this.localFs.mkdirs(targetDir);
    Properties props = new Properties();
    props.put(TimestampBasedCopyableDataset.COPY_POLICY, TimeBasedCopyPolicyForTest.class.getName());
    props.put(TimestampBasedCopyableDataset.DATASET_VERSION_FINDER, TimestampedDatasetVersionFinderForTest.class.getName());
    Path datasetRootPath = this.localFs.getFileStatus(srcRoot).getPath();
    TimestampBasedCopyableDataset copyabledataset = new TimestampBasedCopyableDataset(localFs, props, datasetRootPath);
    TimestampedDatasetVersion srcVersion = new TimestampedDatasetVersion(new DateTime(), versionPath);
    // Generator subclass that returns a mock instead of building a real CopyableFile, so the test
    // only exercises the "should this file be copied" decision logic.
    class SimpleCopyableFileGenerator extends TimestampBasedCopyableDataset.CopyableFileGenerator {

        public SimpleCopyableFileGenerator(TimestampBasedCopyableDataset copyableDataset, FileSystem srcFs, FileSystem targetFs, CopyConfiguration configuration, TimestampedDatasetVersion copyableVersion, ConcurrentLinkedQueue<CopyableFile> copyableFileList) {
            super(srcFs, targetFs, configuration, copyableDataset.datasetRoot(), configuration.getPublishDir(), copyableVersion.getDateTime(), copyableVersion.getPaths(), copyableFileList, copyableDataset.copyableFileFilter());
        }

        @Override
        protected CopyableFile generateCopyableFile(FileStatus singleFile, Path targetPath, long timestampFromPath, Path locationToCopy) throws IOException {
            CopyableFile mockCopyableFile = mock(CopyableFile.class);
            // Tag the mock with the source path so assertions can identify which file was selected.
            when(mockCopyableFile.getFileSet()).thenReturn(singleFile.getPath().toString());
            return mockCopyableFile;
        }
    }
    // When srcFile exists on src but not on target, srcFile should be included in the copyableFileList.
    CopyConfiguration configuration1 = mock(CopyConfiguration.class);
    when(configuration1.getPublishDir()).thenReturn(localFs.getFileStatus(targetDir).getPath());
    ConcurrentLinkedQueue<CopyableFile> copyableFileList1 = new ConcurrentLinkedQueue<>();
    TimestampBasedCopyableDataset.CopyableFileGenerator copyFileGenerator1 = new SimpleCopyableFileGenerator(copyabledataset, localFs, localFs, configuration1, srcVersion, copyableFileList1);
    copyFileGenerator1.run();
    Assert.assertEquals(copyableFileList1.size(), 1);
    Assert.assertEquals(copyableFileList1.poll().getFileSet(), localFs.getFileStatus(srcfile).getPath().toString());
    // When files exist on both locations but with different timestamp, the result should only include newer src files.
    String noNeedToCopyFile = "file2";
    Path oldSrcFile = new Path(versionPath, noNeedToCopyFile);
    this.localFs.create(oldSrcFile).close();
    // Sleep so the target copy created below gets a strictly newer modification timestamp.
    Thread.sleep(100);
    Path newTargetfile = new Path(targetDir, new Path(versionDir, noNeedToCopyFile));
    this.localFs.create(newTargetfile).close();
    copyFileGenerator1.run();
    // Only file1 (missing on target) qualifies; file2's target copy is newer, so it is skipped.
    Assert.assertEquals(copyableFileList1.size(), 1);
    Assert.assertEquals(copyableFileList1.poll().getFileSet(), localFs.getFileStatus(srcfile).getPath().toString());
    // When srcFile exists on both locations and have the same modified timestamp, it should not be included in copyableFileList.
    CopyConfiguration configuration2 = mock(CopyConfiguration.class);
    // Publishing into the dataset root itself makes source and "target" the same files (equal timestamps).
    when(configuration2.getPublishDir()).thenReturn(localFs.getFileStatus(datasetRootPath).getPath());
    ConcurrentLinkedQueue<CopyableFile> copyableFileList2 = new ConcurrentLinkedQueue<>();
    TimestampBasedCopyableDataset.CopyableFileGenerator copyFileGenerator2 = new SimpleCopyableFileGenerator(copyabledataset, localFs, localFs, configuration2, srcVersion, copyableFileList2);
    copyFileGenerator2.run();
    Assert.assertEquals(copyableFileList2.size(), 0);
    this.localFs.delete(testRoot, true);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) TimestampedDatasetVersion(org.apache.gobblin.data.management.version.TimestampedDatasetVersion) Properties(java.util.Properties) DateTime(org.joda.time.DateTime) FileSystem(org.apache.hadoop.fs.FileSystem) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) Test(org.testng.annotations.Test) BeforeTest(org.testng.annotations.BeforeTest)

Example 2 with CopyConfiguration

Use of org.apache.gobblin.data.management.copy.CopyConfiguration in the Apache incubator-gobblin project.

From the class ConfigBasedDatasetTest, method testGetCopyableFilesHelper.

/**
 * Builds a mocked PULL replication route from {@code sourceDir} to {@code destinationDir} and
 * returns the copy entities produced by {@link ConfigBasedDataset#getCopyableFiles}.
 *
 * @param sourceDir              local directory acting as the replication source
 * @param destinationDir         local directory acting as the replication target
 * @param sourceWatermark        watermark value reported by the mocked source endpoint
 * @param isFilterEnabled        when true, applies the HiddenFilter path filter (also to directories)
 * @return the copy entities computed for the mocked route
 */
public Collection<? extends CopyEntity> testGetCopyableFilesHelper(String sourceDir, String destinationDir, long sourceWatermark, boolean isFilterEnabled) throws Exception {
    FileSystem localFs = FileSystem.getLocal(new Configuration());
    URI local = localFs.getUri();
    Properties properties = new Properties();
    properties.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/publisher");
    PathFilter pathFilter = DatasetUtils.instantiatePathFilter(properties);
    boolean applyFilterToDirectories = false;
    if (isFilterEnabled) {
        // Re-instantiate the filter after configuring HiddenFilter so hidden paths are excluded.
        properties.setProperty(DatasetUtils.CONFIGURATION_KEY_PREFIX + "path.filter.class", "org.apache.gobblin.util.filters.HiddenFilter");
        properties.setProperty(CopyConfiguration.APPLY_FILTER_TO_DIRECTORIES, "true");
        pathFilter = DatasetUtils.instantiatePathFilter(properties);
        applyFilterToDirectories = Boolean.parseBoolean(properties.getProperty(CopyConfiguration.APPLY_FILTER_TO_DIRECTORIES, "false"));
    }
    // Reuse the already-created local FS rather than constructing a second identical instance.
    CopyConfiguration copyConfiguration = CopyConfiguration.builder(localFs, properties).publishDir(new Path(destinationDir)).preserve(PreserveAttributes.fromMnemonicString("ugp")).build();
    ReplicationMetaData mockMetaData = Mockito.mock(ReplicationMetaData.class);
    Mockito.when(mockMetaData.toString()).thenReturn("Mock Meta Data");
    ReplicationConfiguration mockRC = Mockito.mock(ReplicationConfiguration.class);
    Mockito.when(mockRC.getCopyMode()).thenReturn(ReplicationCopyMode.PULL);
    Mockito.when(mockRC.getMetaData()).thenReturn(mockMetaData);
    // Source endpoint: reports the given watermark and lists files under sourceDir.
    HadoopFsEndPoint copyFrom = Mockito.mock(HadoopFsEndPoint.class);
    Mockito.when(copyFrom.getDatasetPath()).thenReturn(new Path(sourceDir));
    Mockito.when(copyFrom.getFsURI()).thenReturn(local);
    ComparableWatermark sw = new LongWatermark(sourceWatermark);
    Mockito.when(copyFrom.getWatermark()).thenReturn(Optional.of(sw));
    Mockito.when(copyFrom.getFiles()).thenReturn(FileListUtils.listFilesRecursively(localFs, new Path(sourceDir), pathFilter, applyFilterToDirectories));
    // Target endpoint: no watermark yet, lists files under destinationDir.
    HadoopFsEndPoint copyTo = Mockito.mock(HadoopFsEndPoint.class);
    Mockito.when(copyTo.getDatasetPath()).thenReturn(new Path(destinationDir));
    Mockito.when(copyTo.getFsURI()).thenReturn(local);
    Optional<ComparableWatermark> tmp = Optional.absent();
    Mockito.when(copyTo.getWatermark()).thenReturn(tmp);
    Mockito.when(copyTo.getFiles()).thenReturn(FileListUtils.listFilesRecursively(localFs, new Path(destinationDir), pathFilter, applyFilterToDirectories));
    CopyRoute route = Mockito.mock(CopyRoute.class);
    Mockito.when(route.getCopyFrom()).thenReturn(copyFrom);
    Mockito.when(route.getCopyTo()).thenReturn(copyTo);
    ConfigBasedDataset dataset = new ConfigBasedDataset(mockRC, properties, route);
    Collection<? extends CopyEntity> copyableFiles = dataset.getCopyableFiles(localFs, copyConfiguration);
    return copyableFiles;
}
Also used : Path(org.apache.hadoop.fs.Path) PathFilter(org.apache.hadoop.fs.PathFilter) Configuration(org.apache.hadoop.conf.Configuration) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) Properties(java.util.Properties) URI(java.net.URI) ComparableWatermark(org.apache.gobblin.source.extractor.ComparableWatermark) FileSystem(org.apache.hadoop.fs.FileSystem) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)

Example 3 with CopyConfiguration

Use of org.apache.gobblin.data.management.copy.CopyConfiguration in the Apache incubator-gobblin project.

From the class RegistrationTimeSkipPredicateTest, method test.

/**
 * Verifies {@link RegistrationTimeSkipPredicate}: a partition copy is skipped only when the
 * existing target partition's registration time is strictly greater than the source path's
 * modification time, and never when the target partition is absent or unannotated.
 */
@Test
public void test() throws Exception {
    Path partition1Path = new Path("/path/to/partition1");
    long modTime = 100000;
    CopyContext copyContext = new CopyContext();
    CopyConfiguration copyConfiguration = Mockito.mock(CopyConfiguration.class);
    Mockito.doReturn(copyContext).when(copyConfiguration).getCopyContext();
    HiveDataset dataset = Mockito.mock(HiveDataset.class);
    // Spy on a real local FS so path qualification works, then stub getFileStatus to report modTime.
    FileSystem fs = Mockito.spy(FileSystem.getLocal(new Configuration()));
    FileStatus status = new FileStatus(1, false, 1, 1, modTime, partition1Path);
    Path qualifiedPath = fs.makeQualified(partition1Path);
    // Stub both the qualified and unqualified forms of the path, since the predicate may use either.
    Mockito.doReturn(status).when(fs).getFileStatus(qualifiedPath);
    Mockito.doReturn(status).when(fs).getFileStatus(partition1Path);
    Mockito.doReturn(fs).when(dataset).getFs();
    HiveCopyEntityHelper helper = Mockito.mock(HiveCopyEntityHelper.class);
    Mockito.doReturn(copyConfiguration).when(helper).getConfiguration();
    Mockito.doReturn(dataset).when(helper).getDataset();
    RegistrationTimeSkipPredicate predicate = new RegistrationTimeSkipPredicate(helper);
    // partition exists, but registration time before modtime => don't skip
    HivePartitionFileSet pc = createPartitionCopy(partition1Path, modTime - 1, true);
    Assert.assertFalse(predicate.apply(pc));
    // partition exists, registration time equal modtime => don't skip
    pc = createPartitionCopy(partition1Path, modTime, true);
    Assert.assertFalse(predicate.apply(pc));
    // partition exists, registration time larger modtime => do skip
    pc = createPartitionCopy(partition1Path, modTime + 1, true);
    Assert.assertTrue(predicate.apply(pc));
    // partition doesn't exist => don't skip
    pc = createPartitionCopy(partition1Path, modTime + 1, false);
    Assert.assertFalse(predicate.apply(pc));
    // partition exists but is not annotated (registration-time parameter removed) => don't skip
    pc = createPartitionCopy(partition1Path, modTime + 1, true);
    pc.getExistingTargetPartition().get().getParameters().clear();
    Assert.assertFalse(predicate.apply(pc));
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) HiveDataset(org.apache.gobblin.data.management.copy.hive.HiveDataset) CopyContext(org.apache.gobblin.data.management.copy.CopyContext) HivePartitionFileSet(org.apache.gobblin.data.management.copy.hive.HivePartitionFileSet) HiveCopyEntityHelper(org.apache.gobblin.data.management.copy.hive.HiveCopyEntityHelper) Test(org.testng.annotations.Test)

Example 4 with CopyConfiguration

Use of org.apache.gobblin.data.management.copy.CopyConfiguration in the Apache incubator-gobblin project.

From the class TimestampBasedCopyableDatasetTest, method testCopyableFileGenerator.

/**
 * Test {@link TimestampBasedCopyableDataset.CopyableFileGenerator} when src location is empty and
 * also when it is null: an empty source yields no copyable files, while a null source path makes
 * the generator throw a {@link RuntimeException} (declared via {@code expectedExceptions}).
 */
@Test(expectedExceptions = RuntimeException.class)
public void testCopyableFileGenerator() {
    Properties properties = new Properties();
    properties.put(TimestampBasedCopyableDataset.COPY_POLICY, TimeBasedCopyPolicyForTest.class.getName());
    properties.put(TimestampBasedCopyableDataset.DATASET_VERSION_FINDER, TimestampedDatasetVersionFinderForTest.class.getName());
    TimestampBasedCopyableDataset dataset = new TimestampBasedCopyableDataset(localFs, properties, new Path("dummy"));
    CopyConfiguration copyConfig = mock(CopyConfiguration.class);
    when(copyConfig.getPublishDir()).thenReturn(new Path("publishDir"));
    ConcurrentLinkedQueue<CopyableFile> generatedFiles = new ConcurrentLinkedQueue<>();
    // Case 1: the source path exists nowhere (empty) — the generator must produce nothing.
    TimestampedDatasetVersion versionWithEmptyPath = new TimestampedDatasetVersion(new DateTime(), new Path("dummy2"));
    TimestampBasedCopyableDataset.CopyableFileGenerator generatorForEmptyPath =
        dataset.getCopyableFileGenetator(localFs, copyConfig, versionWithEmptyPath, generatedFiles);
    generatorForEmptyPath.run();
    Assert.assertEquals(generatedFiles.size(), 0);
    // Case 2: the source path is null — running the generator must raise the expected RuntimeException.
    TimestampedDatasetVersion versionWithNullPath = new TimestampedDatasetVersion(new DateTime(), null);
    TimestampBasedCopyableDataset.CopyableFileGenerator generatorForNullPath =
        dataset.getCopyableFileGenetator(localFs, copyConfig, versionWithNullPath, generatedFiles);
    generatorForNullPath.run();
}
Also used : Path(org.apache.hadoop.fs.Path) TimestampedDatasetVersion(org.apache.gobblin.data.management.version.TimestampedDatasetVersion) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) Properties(java.util.Properties) ConcurrentLinkedQueue(java.util.concurrent.ConcurrentLinkedQueue) DateTime(org.joda.time.DateTime) Test(org.testng.annotations.Test) BeforeTest(org.testng.annotations.BeforeTest)

Example 5 with CopyConfiguration

Use of org.apache.gobblin.data.management.copy.CopyConfiguration in the Apache incubator-gobblin project.

From the class HiveCopyEntityHelper, method getCopyableFilesFromPaths.

/**
 * Get builders for a {@link CopyableFile} for each file referred to by a {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}.
 *
 * For every input {@link FileStatus}, computes its destination path via the target-path helper,
 * resolves the ancestor owner/permission chain up to the table root, and returns one partially
 * configured {@link CopyableFile.Builder} per file.
 *
 * @param paths         source files to copy
 * @param configuration copy configuration used for permission preservation and builder setup
 * @param partition     Hive partition the files belong to, if any
 * @return one {@link CopyableFile.Builder} per input file
 * @throws IOException if the dataset has no concrete table root path, or on filesystem errors
 */
List<CopyableFile.Builder> getCopyableFilesFromPaths(Collection<FileStatus> paths, CopyConfiguration configuration, Optional<Partition> partition) throws IOException {
    List<CopyableFile.Builder> builders = Lists.newArrayList();
    List<SourceAndDestination> dataFiles = Lists.newArrayList();
    Configuration hadoopConfiguration = new Configuration();
    // Cache the source FileSystem across consecutive files that share a scheme and authority,
    // so we don't look up a new FS instance per file.
    FileSystem actualSourceFs = null;
    String referenceScheme = null;
    String referenceAuthority = null;
    // First pass: pair each source file with its computed destination path.
    for (FileStatus status : paths) {
        dataFiles.add(new SourceAndDestination(status, getTargetPathHelper().getTargetPath(status.getPath(), this.targetFs, partition, true)));
    }
    for (SourceAndDestination sourceAndDestination : dataFiles) {
        URI uri = sourceAndDestination.getSource().getPath().toUri();
        // Refresh the cached FS whenever the file's scheme or authority differs from the last one seen.
        if (actualSourceFs == null || !StringUtils.equals(referenceScheme, uri.getScheme()) || !StringUtils.equals(referenceAuthority, uri.getAuthority())) {
            actualSourceFs = sourceAndDestination.getSource().getPath().getFileSystem(hadoopConfiguration);
            referenceScheme = uri.getScheme();
            referenceAuthority = uri.getAuthority();
        }
        if (!this.dataset.getTableRootPath().isPresent()) {
            // Ancestor permissions are resolved relative to the table root, so a table without a
            // concrete root path cannot be handled. NOTE(review): if glob table locations ever become
            // supported on the Hive side, this logic will have to change.
            throw new IOException(String.format("Table %s does not have a concrete table root path.", this.dataset.getTable().getCompleteName()));
        }
        List<OwnerAndPermission> ancestorOwnerAndPermission = CopyableFile.resolveReplicatedOwnerAndPermissionsRecursively(actualSourceFs, sourceAndDestination.getSource().getPath().getParent(), this.dataset.getTableRootPath().get().getParent(), configuration);
        builders.add(CopyableFile.fromOriginAndDestination(actualSourceFs, sourceAndDestination.getSource(), sourceAndDestination.getDestination(), configuration).ancestorsOwnerAndPermission(ancestorOwnerAndPermission));
    }
    return builders;
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) Builder(lombok.Builder) FileSystem(org.apache.hadoop.fs.FileSystem) OwnerAndPermission(org.apache.gobblin.data.management.copy.OwnerAndPermission) ToString(lombok.ToString) IOException(java.io.IOException) URI(java.net.URI)

Aggregations

CopyConfiguration (org.apache.gobblin.data.management.copy.CopyConfiguration)6 FileSystem (org.apache.hadoop.fs.FileSystem)5 Path (org.apache.hadoop.fs.Path)5 Configuration (org.apache.hadoop.conf.Configuration)4 FileStatus (org.apache.hadoop.fs.FileStatus)4 Properties (java.util.Properties)3 CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile)3 Test (org.testng.annotations.Test)3 URI (java.net.URI)2 ConcurrentLinkedQueue (java.util.concurrent.ConcurrentLinkedQueue)2 TimestampedDatasetVersion (org.apache.gobblin.data.management.version.TimestampedDatasetVersion)2 DateTime (org.joda.time.DateTime)2 BeforeTest (org.testng.annotations.BeforeTest)2 IOException (java.io.IOException)1 Builder (lombok.Builder)1 ToString (lombok.ToString)1 CopyContext (org.apache.gobblin.data.management.copy.CopyContext)1 CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity)1 OwnerAndPermission (org.apache.gobblin.data.management.copy.OwnerAndPermission)1 PostPublishStep (org.apache.gobblin.data.management.copy.entities.PostPublishStep)1