Use of org.apache.gobblin.data.management.copy.CopyConfiguration in project incubator-gobblin by apache.
From the class TimestampBasedCopyableDatasetTest, method testIsCopyableFile:
/**
* Test {@link TimestampBasedCopyableDataset.CopyableFileGenerator}'s logic to determine copyable files.
*/
@Test
public void testIsCopyableFile() throws IOException, InterruptedException {
  Path testRoot = new Path("testCopyableFileGenerator");
  Path srcRoot = new Path(testRoot, "datasetRoot");
  String versionDir = "dummyversion";
  Path versionPath = new Path(srcRoot, versionDir);
  Path targetDir = new Path(testRoot, "target");
  if (this.localFs.exists(testRoot)) {
    this.localFs.delete(testRoot, true);
  }
  this.localFs.mkdirs(versionPath);
  Path srcfile = new Path(versionPath, "file1");
  this.localFs.create(srcfile);
  this.localFs.mkdirs(targetDir);

  Properties props = new Properties();
  props.put(TimestampBasedCopyableDataset.COPY_POLICY, TimeBasedCopyPolicyForTest.class.getName());
  props.put(TimestampBasedCopyableDataset.DATASET_VERSION_FINDER, TimestampedDatasetVersionFinderForTest.class.getName());
  Path datasetRootPath = this.localFs.getFileStatus(srcRoot).getPath();
  TimestampBasedCopyableDataset copyabledataset = new TimestampBasedCopyableDataset(localFs, props, datasetRootPath);
  TimestampedDatasetVersion srcVersion = new TimestampedDatasetVersion(new DateTime(), versionPath);

  class SimpleCopyableFileGenerator extends TimestampBasedCopyableDataset.CopyableFileGenerator {

    public SimpleCopyableFileGenerator(TimestampBasedCopyableDataset copyableDataset, FileSystem srcFs, FileSystem targetFs, CopyConfiguration configuration, TimestampedDatasetVersion copyableVersion, ConcurrentLinkedQueue<CopyableFile> copyableFileList) {
      super(srcFs, targetFs, configuration, copyableDataset.datasetRoot(), configuration.getPublishDir(), copyableVersion.getDateTime(), copyableVersion.getPaths(), copyableFileList, copyableDataset.copyableFileFilter());
    }

    @Override
    protected CopyableFile generateCopyableFile(FileStatus singleFile, Path targetPath, long timestampFromPath, Path locationToCopy) throws IOException {
      CopyableFile mockCopyableFile = mock(CopyableFile.class);
      when(mockCopyableFile.getFileSet()).thenReturn(singleFile.getPath().toString());
      return mockCopyableFile;
    }
  }

  // When srcFile exists on src but not on target, srcFile should be included in the copyableFileList.
  CopyConfiguration configuration1 = mock(CopyConfiguration.class);
  when(configuration1.getPublishDir()).thenReturn(localFs.getFileStatus(targetDir).getPath());
  ConcurrentLinkedQueue<CopyableFile> copyableFileList1 = new ConcurrentLinkedQueue<>();
  TimestampBasedCopyableDataset.CopyableFileGenerator copyFileGenerator1 = new SimpleCopyableFileGenerator(copyabledataset, localFs, localFs, configuration1, srcVersion, copyableFileList1);
  copyFileGenerator1.run();
  Assert.assertEquals(copyableFileList1.size(), 1);
  Assert.assertEquals(copyableFileList1.poll().getFileSet(), localFs.getFileStatus(srcfile).getPath().toString());

  // When files exist in both locations but with different timestamps, the result should only include newer src files.
  String noNeedToCopyFile = "file2";
  Path oldSrcFile = new Path(versionPath, noNeedToCopyFile);
  this.localFs.create(oldSrcFile);
  Thread.sleep(100);
  Path newTargetfile = new Path(targetDir, new Path(versionDir, noNeedToCopyFile));
  this.localFs.create(newTargetfile);
  copyFileGenerator1.run();
  Assert.assertEquals(copyableFileList1.size(), 1);
  Assert.assertEquals(copyableFileList1.poll().getFileSet(), localFs.getFileStatus(srcfile).getPath().toString());

  // When srcFile exists in both locations with the same modification timestamp, it should not be included in the copyableFileList.
  CopyConfiguration configuration2 = mock(CopyConfiguration.class);
  when(configuration2.getPublishDir()).thenReturn(localFs.getFileStatus(datasetRootPath).getPath());
  ConcurrentLinkedQueue<CopyableFile> copyableFileList2 = new ConcurrentLinkedQueue<>();
  TimestampBasedCopyableDataset.CopyableFileGenerator copyFileGenerator2 = new SimpleCopyableFileGenerator(copyabledataset, localFs, localFs, configuration2, srcVersion, copyableFileList2);
  copyFileGenerator2.run();
  Assert.assertEquals(copyableFileList2.size(), 0);
  this.localFs.delete(testRoot, true);
}
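Note that the test above only needs getPublishDir() from CopyConfiguration, so a Mockito mock suffices. A real instance could be built instead; a minimal sketch, assuming the same builder API used in the ConfigBasedDatasetTest example below, and mirroring the DATA_PUBLISHER_FINAL_DIR property that example sets before calling the builder:

// Sketch only: a real CopyConfiguration for this test's target directory, built with the
// same builder calls as the ConfigBasedDatasetTest example below.
Properties copyProps = new Properties();
copyProps.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/publisher");
CopyConfiguration realConfiguration = CopyConfiguration.builder(localFs, copyProps)
    .publishDir(localFs.getFileStatus(targetDir).getPath())
    .preserve(PreserveAttributes.fromMnemonicString("ugp"))
    .build();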
Use of org.apache.gobblin.data.management.copy.CopyConfiguration in project incubator-gobblin by apache.
From the class ConfigBasedDatasetTest, method testGetCopyableFilesHelper:
public Collection<? extends CopyEntity> testGetCopyableFilesHelper(String sourceDir, String destinationDir, long sourceWatermark, boolean isFilterEnabled) throws Exception {
  FileSystem localFs = FileSystem.getLocal(new Configuration());
  URI local = localFs.getUri();
  Properties properties = new Properties();
  properties.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/publisher");
  PathFilter pathFilter = DatasetUtils.instantiatePathFilter(properties);
  boolean applyFilterToDirectories = false;
  if (isFilterEnabled) {
    properties.setProperty(DatasetUtils.CONFIGURATION_KEY_PREFIX + "path.filter.class", "org.apache.gobblin.util.filters.HiddenFilter");
    properties.setProperty(CopyConfiguration.APPLY_FILTER_TO_DIRECTORIES, "true");
    pathFilter = DatasetUtils.instantiatePathFilter(properties);
    applyFilterToDirectories = Boolean.parseBoolean(properties.getProperty(CopyConfiguration.APPLY_FILTER_TO_DIRECTORIES, "false"));
  }

  CopyConfiguration copyConfiguration = CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), properties)
      .publishDir(new Path(destinationDir))
      .preserve(PreserveAttributes.fromMnemonicString("ugp"))
      .build();

  ReplicationMetaData mockMetaData = Mockito.mock(ReplicationMetaData.class);
  Mockito.when(mockMetaData.toString()).thenReturn("Mock Meta Data");

  ReplicationConfiguration mockRC = Mockito.mock(ReplicationConfiguration.class);
  Mockito.when(mockRC.getCopyMode()).thenReturn(ReplicationCopyMode.PULL);
  Mockito.when(mockRC.getMetaData()).thenReturn(mockMetaData);

  HadoopFsEndPoint copyFrom = Mockito.mock(HadoopFsEndPoint.class);
  Mockito.when(copyFrom.getDatasetPath()).thenReturn(new Path(sourceDir));
  Mockito.when(copyFrom.getFsURI()).thenReturn(local);
  ComparableWatermark sw = new LongWatermark(sourceWatermark);
  Mockito.when(copyFrom.getWatermark()).thenReturn(Optional.of(sw));
  Mockito.when(copyFrom.getFiles()).thenReturn(FileListUtils.listFilesRecursively(localFs, new Path(sourceDir), pathFilter, applyFilterToDirectories));

  HadoopFsEndPoint copyTo = Mockito.mock(HadoopFsEndPoint.class);
  Mockito.when(copyTo.getDatasetPath()).thenReturn(new Path(destinationDir));
  Mockito.when(copyTo.getFsURI()).thenReturn(local);
  Optional<ComparableWatermark> tmp = Optional.absent();
  Mockito.when(copyTo.getWatermark()).thenReturn(tmp);
  Mockito.when(copyTo.getFiles()).thenReturn(FileListUtils.listFilesRecursively(localFs, new Path(destinationDir), pathFilter, applyFilterToDirectories));

  CopyRoute route = Mockito.mock(CopyRoute.class);
  Mockito.when(route.getCopyFrom()).thenReturn(copyFrom);
  Mockito.when(route.getCopyTo()).thenReturn(copyTo);

  ConfigBasedDataset dataset = new ConfigBasedDataset(mockRC, properties, route);
  Collection<? extends CopyEntity> copyableFiles = dataset.getCopyableFiles(localFs, copyConfiguration);
  return copyableFiles;
}
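A hypothetical caller of this helper might exercise both the filtered and unfiltered paths; the directories and watermark below are illustrative placeholders, not values from the source:

// Illustrative only: source/destination directories and the watermark are assumed values.
@Test
public void testGetCopyableFilesWithAndWithoutFilter() throws Exception {
  Collection<? extends CopyEntity> unfiltered = testGetCopyableFilesHelper("/tmp/src", "/tmp/dest", 100L, false);
  // With the HiddenFilter enabled (and applied to directories as well), paths whose names
  // start with '.' or '_' are dropped from both the source and destination listings.
  Collection<? extends CopyEntity> filtered = testGetCopyableFilesHelper("/tmp/src", "/tmp/dest", 100L, true);
  Assert.assertNotNull(unfiltered);
  Assert.assertNotNull(filtered);
}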
Use of org.apache.gobblin.data.management.copy.CopyConfiguration in project incubator-gobblin by apache.
From the class RegistrationTimeSkipPredicateTest, method test:
@Test
public void test() throws Exception {
  Path partition1Path = new Path("/path/to/partition1");
  long modTime = 100000;
  CopyContext copyContext = new CopyContext();
  CopyConfiguration copyConfiguration = Mockito.mock(CopyConfiguration.class);
  Mockito.doReturn(copyContext).when(copyConfiguration).getCopyContext();
  HiveDataset dataset = Mockito.mock(HiveDataset.class);
  FileSystem fs = Mockito.spy(FileSystem.getLocal(new Configuration()));
  FileStatus status = new FileStatus(1, false, 1, 1, modTime, partition1Path);
  Path qualifiedPath = fs.makeQualified(partition1Path);
  Mockito.doReturn(status).when(fs).getFileStatus(qualifiedPath);
  Mockito.doReturn(status).when(fs).getFileStatus(partition1Path);
  Mockito.doReturn(fs).when(dataset).getFs();
  HiveCopyEntityHelper helper = Mockito.mock(HiveCopyEntityHelper.class);
  Mockito.doReturn(copyConfiguration).when(helper).getConfiguration();
  Mockito.doReturn(dataset).when(helper).getDataset();

  RegistrationTimeSkipPredicate predicate = new RegistrationTimeSkipPredicate(helper);

  // Partition exists, but registration time is before modtime => don't skip.
  HivePartitionFileSet pc = createPartitionCopy(partition1Path, modTime - 1, true);
  Assert.assertFalse(predicate.apply(pc));

  // Partition exists, registration time is equal to modtime => don't skip.
  pc = createPartitionCopy(partition1Path, modTime, true);
  Assert.assertFalse(predicate.apply(pc));

  // Partition exists, registration time is greater than modtime => do skip.
  pc = createPartitionCopy(partition1Path, modTime + 1, true);
  Assert.assertTrue(predicate.apply(pc));

  // Partition doesn't exist => don't skip.
  pc = createPartitionCopy(partition1Path, modTime + 1, false);
  Assert.assertFalse(predicate.apply(pc));

  // Partition exists but is not annotated => don't skip.
  pc = createPartitionCopy(partition1Path, modTime + 1, true);
  pc.getExistingTargetPartition().get().getParameters().clear();
  Assert.assertFalse(predicate.apply(pc));
}
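The createPartitionCopy helper is not shown in this excerpt. A rough sketch of what it could look like, using the same Mockito and Guava Optional style as the test above; the registration-time parameter key and the mock wiring are assumptions, and the source-partition setup the real helper presumably performs is omitted:

// Hypothetical reconstruction of createPartitionCopy, covering only what the assertions above need.
private HivePartitionFileSet createPartitionCopy(Path location, long registrationTime, boolean targetPartitionExists) {
  HivePartitionFileSet partitionCopy = Mockito.mock(HivePartitionFileSet.class);
  if (targetPartitionExists) {
    Partition targetPartition = Mockito.mock(Partition.class);
    Map<String, String> parameters = new HashMap<>();
    parameters.put("registrationGenerationTimeMillis", Long.toString(registrationTime)); // assumed parameter key
    Mockito.doReturn(parameters).when(targetPartition).getParameters();
    Mockito.doReturn(Optional.of(targetPartition)).when(partitionCopy).getExistingTargetPartition();
  } else {
    Mockito.doReturn(Optional.absent()).when(partitionCopy).getExistingTargetPartition();
  }
  return partitionCopy;
}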
Use of org.apache.gobblin.data.management.copy.CopyConfiguration in project incubator-gobblin by apache.
From the class TimestampBasedCopyableDatasetTest, method testCopyableFileGenerator:
/**
* Test {@link TimestampBasedCopyableDataset.CopyableFileGenerator} when src location is empty and also when it is null.
*/
@Test(expectedExceptions = RuntimeException.class)
public void testCopyableFileGenerator() {
  Properties props = new Properties();
  props.put(TimestampBasedCopyableDataset.COPY_POLICY, TimeBasedCopyPolicyForTest.class.getName());
  props.put(TimestampBasedCopyableDataset.DATASET_VERSION_FINDER, TimestampedDatasetVersionFinderForTest.class.getName());
  TimestampBasedCopyableDataset copyabledataset = new TimestampBasedCopyableDataset(localFs, props, new Path("dummy"));
  CopyConfiguration configuration = mock(CopyConfiguration.class);
  when(configuration.getPublishDir()).thenReturn(new Path("publishDir"));
  ConcurrentLinkedQueue<CopyableFile> copyableFileList = new ConcurrentLinkedQueue<>();

  // The src path is empty.
  TimestampedDatasetVersion emptyVersion = new TimestampedDatasetVersion(new DateTime(), new Path("dummy2"));
  TimestampBasedCopyableDataset.CopyableFileGenerator emptyGenerator = copyabledataset.getCopyableFileGenetator(localFs, configuration, emptyVersion, copyableFileList);
  emptyGenerator.run();
  Assert.assertEquals(copyableFileList.size(), 0);

  // The src path is null.
  TimestampedDatasetVersion versionHasNullPath = new TimestampedDatasetVersion(new DateTime(), null);
  TimestampBasedCopyableDataset.CopyableFileGenerator exceptionGenerator = copyabledataset.getCopyableFileGenetator(localFs, configuration, versionHasNullPath, copyableFileList);
  exceptionGenerator.run();
}
Use of org.apache.gobblin.data.management.copy.CopyConfiguration in project incubator-gobblin by apache.
From the class HiveCopyEntityHelper, method getCopyableFilesFromPaths:
/**
* Get builders for a {@link CopyableFile} for each file referred to by a {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}.
*/
List<CopyableFile.Builder> getCopyableFilesFromPaths(Collection<FileStatus> paths, CopyConfiguration configuration, Optional<Partition> partition) throws IOException {
  List<CopyableFile.Builder> builders = Lists.newArrayList();
  List<SourceAndDestination> dataFiles = Lists.newArrayList();
  Configuration hadoopConfiguration = new Configuration();
  FileSystem actualSourceFs = null;
  String referenceScheme = null;
  String referenceAuthority = null;

  for (FileStatus status : paths) {
    dataFiles.add(new SourceAndDestination(status, getTargetPathHelper().getTargetPath(status.getPath(), this.targetFs, partition, true)));
  }

  for (SourceAndDestination sourceAndDestination : dataFiles) {
    URI uri = sourceAndDestination.getSource().getPath().toUri();
    // Only re-resolve the source FileSystem when the scheme or authority differs from the previous file's.
    if (actualSourceFs == null || !StringUtils.equals(referenceScheme, uri.getScheme()) || !StringUtils.equals(referenceAuthority, uri.getAuthority())) {
      actualSourceFs = sourceAndDestination.getSource().getPath().getFileSystem(hadoopConfiguration);
      referenceScheme = uri.getScheme();
      referenceAuthority = uri.getAuthority();
    }
    if (!this.dataset.getTableRootPath().isPresent()) {
      // Resolving ancestor owners and permissions requires a concrete table root path. If a table can be registered
      // with a glob location on the Hive side and we try to copy such a table, this logic will have to change.
      throw new IOException(String.format("Table %s does not have a concrete table root path.", this.dataset.getTable().getCompleteName()));
    }
    List<OwnerAndPermission> ancestorOwnerAndPermission = CopyableFile.resolveReplicatedOwnerAndPermissionsRecursively(actualSourceFs, sourceAndDestination.getSource().getPath().getParent(), this.dataset.getTableRootPath().get().getParent(), configuration);
    builders.add(CopyableFile.fromOriginAndDestination(actualSourceFs, sourceAndDestination.getSource(), sourceAndDestination.getDestination(), configuration).ancestorsOwnerAndPermission(ancestorOwnerAndPermission));
  }
  return builders;
}
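A caller of getCopyableFilesFromPaths would typically finish building each returned builder; a minimal sketch, where helper, fileStatuses, copyConfig, and partition are placeholder names and the standard build() terminator on CopyableFile.Builder is assumed:

// Sketch only: materialize the builders returned by getCopyableFilesFromPaths.
List<CopyableFile> copyableFiles = Lists.newArrayList();
for (CopyableFile.Builder builder : helper.getCopyableFilesFromPaths(fileStatuses, copyConfig, partition)) {
  copyableFiles.add(builder.build()); // assumes CopyableFile.Builder#build()
}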