Example usage of org.apache.gobblin.util.commit.DeleteFileCommitStep in the Apache project incubator-gobblin, from class RecursiveCopyableDatasetTest, method testCopyWithDeleteTargetAndDeleteParentDirectories.
@Test
public void testCopyWithDeleteTargetAndDeleteParentDirectories() throws Exception {
  // Dataset roots used by the fake dataset.
  Path srcRoot = new Path("/source");
  Path destRoot = new Path("/target");

  // "file1" exists only at the source; "file3" exists only at the target.
  List<FileStatus> filesAtSource = Lists.newArrayList(createFileStatus(srcRoot, "file1"));
  List<FileStatus> filesAtTarget = Lists.newArrayList(createFileStatus(destRoot, "file3"));

  // Enable deletion of extraneous target files AND cleanup of emptied parent directories.
  Properties props = new Properties();
  props.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, destRoot.toString());
  props.setProperty(RecursiveCopyableDataset.DELETE_EMPTY_DIRECTORIES_KEY, "true");
  props.setProperty(RecursiveCopyableDataset.DELETE_KEY, "true");

  RecursiveCopyableDataset dataset =
      new TestRecursiveCopyableDataset(srcRoot, destRoot, filesAtSource, filesAtTarget, props);
  Collection<? extends CopyEntity> copyableFiles = dataset.getCopyableFiles(
      FileSystem.getLocal(new Configuration()),
      CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), props).build());

  // One entity to copy "file1" plus one commit-step entity carrying the deletion.
  Assert.assertEquals(copyableFiles.size(), 2);
  ClassifiedFiles classified = classifyFiles(copyableFiles);
  Assert.assertTrue(classified.getPathsToCopy().containsKey(new Path(srcRoot, "file1")));
  Assert.assertEquals(classified.getPathsToCopy().get(new Path(srcRoot, "file1")), new Path(destRoot, "file1"));
  Assert.assertEquals(classified.getPathsToDelete().size(), 1);
  Assert.assertTrue(classified.getPathsToDelete().contains(new Path(destRoot, "file3")));

  // Locate the commit-step entity among the copy entities.
  Predicate<CopyEntity> isCommitStep = new Predicate<CopyEntity>() {
    @Override
    public boolean apply(@Nullable CopyEntity copyEntity) {
      return copyEntity instanceof CommitStepCopyEntity;
    }
  };
  CommitStepCopyEntity entity =
      (CommitStepCopyEntity) Iterables.filter(copyableFiles, isCommitStep).iterator().next();

  // Parent-directory deletion must be enabled and bounded by the target root.
  DeleteFileCommitStep step = (DeleteFileCommitStep) entity.getStep();
  Assert.assertTrue(step.getParentDeletionLimit().isPresent());
  Assert.assertEquals(step.getParentDeletionLimit().get(), destRoot);
}
Example usage of org.apache.gobblin.util.commit.DeleteFileCommitStep in the Apache project incubator-gobblin, from class RecursiveCopyableDatasetTest, method testCopyWithDeleteTarget.
@Test
public void testCopyWithDeleteTarget() throws Exception {
  // Dataset roots used by the fake dataset.
  Path srcRoot = new Path("/source");
  Path destRoot = new Path("/target");

  // "file1" exists only at the source; "file3" exists only at the target.
  List<FileStatus> filesAtSource = Lists.newArrayList(createFileStatus(srcRoot, "file1"));
  List<FileStatus> filesAtTarget = Lists.newArrayList(createFileStatus(destRoot, "file3"));

  // Enable deletion of extraneous target files, but NOT empty-parent-directory cleanup.
  Properties props = new Properties();
  props.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, destRoot.toString());
  props.setProperty(RecursiveCopyableDataset.DELETE_KEY, "true");

  RecursiveCopyableDataset dataset =
      new TestRecursiveCopyableDataset(srcRoot, destRoot, filesAtSource, filesAtTarget, props);
  Collection<? extends CopyEntity> copyableFiles = dataset.getCopyableFiles(
      FileSystem.getLocal(new Configuration()),
      CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), props).build());

  // One entity to copy "file1" plus one commit-step entity carrying the deletion.
  Assert.assertEquals(copyableFiles.size(), 2);
  ClassifiedFiles classified = classifyFiles(copyableFiles);
  Assert.assertTrue(classified.getPathsToCopy().containsKey(new Path(srcRoot, "file1")));
  Assert.assertEquals(classified.getPathsToCopy().get(new Path(srcRoot, "file1")), new Path(destRoot, "file1"));
  Assert.assertEquals(classified.getPathsToDelete().size(), 1);
  Assert.assertTrue(classified.getPathsToDelete().contains(new Path(destRoot, "file3")));

  // Locate the commit-step entity among the copy entities.
  Predicate<CopyEntity> isCommitStep = new Predicate<CopyEntity>() {
    @Override
    public boolean apply(@Nullable CopyEntity copyEntity) {
      return copyEntity instanceof CommitStepCopyEntity;
    }
  };
  CommitStepCopyEntity entity =
      (CommitStepCopyEntity) Iterables.filter(copyableFiles, isCommitStep).iterator().next();

  // Without DELETE_EMPTY_DIRECTORIES_KEY, no parent-deletion limit should be set.
  DeleteFileCommitStep step = (DeleteFileCommitStep) entity.getStep();
  Assert.assertFalse(step.getParentDeletionLimit().isPresent());
}
Example usage of org.apache.gobblin.util.commit.DeleteFileCommitStep in the Apache project incubator-gobblin, from class HiveCopyEntityHelper, method addTableDeregisterSteps.
/**
 * Appends post-publish steps that delete the table's data (per the configured delete
 * method) and then deregister the table, returning the next available step priority.
 */
@VisibleForTesting
protected int addTableDeregisterSteps(List<CopyEntity> copyEntities, String fileSet, int initialPriority, Table table) throws IOException {
  int priority = initialPriority;

  // Resolve the set of data paths to delete according to the delete method.
  Collection<Path> pathsToDelete;
  switch (this.getDeleteMethod()) {
    case RECURSIVE:
      // Delete the table's data location wholesale.
      pathsToDelete = Lists.newArrayList(table.getDataLocation());
      break;
    case INPUT_FORMAT:
      // Ask the table's input format which concrete paths make up the data.
      InputFormat<?, ?> inputFormat = HiveUtils.getInputFormat(table.getSd());
      HiveLocationDescriptor location = new HiveLocationDescriptor(table.getDataLocation(), inputFormat, this.getTargetFs(), this.getDataset().getProperties());
      pathsToDelete = location.getPaths().keySet();
      break;
    case NO_DELETE:
    default:
      // NO_DELETE and any unrecognized method both delete nothing.
      pathsToDelete = Lists.newArrayList();
      break;
  }

  // Only emit a delete step when there is something to delete.
  if (!pathsToDelete.isEmpty()) {
    DeleteFileCommitStep deleteStep = DeleteFileCommitStep.fromPaths(this.getTargetFs(), pathsToDelete, this.getDataset().getProperties(), table.getDataLocation());
    copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), deleteStep, priority++));
  }

  // Deregistration always runs, after any deletion step.
  TableDeregisterStep deregisterStep = new TableDeregisterStep(table.getTTable(), this.getTargetURI(), this.getHiveRegProps());
  copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), deregisterStep, priority++));
  return priority;
}
Example usage of org.apache.gobblin.util.commit.DeleteFileCommitStep in the Apache project incubator-gobblin, from class ConfigBasedDataset, method getCopyableFiles.
/**
 * Builds the list of {@link CopyEntity}s required to replicate this dataset from the
 * copy-from endpoint to the copy-to endpoint of the configured {@code copyRoute}.
 * Only Hadoop FS endpoints are supported; otherwise an empty list is returned.
 * The result may include a pre-publish delete step (for stale/overwritten target files)
 * and a post-publish watermark-file generation step.
 */
@Override
public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration copyConfiguration) throws IOException {
List<CopyEntity> copyableFiles = Lists.newArrayList();
EndPoint copyFromRaw = copyRoute.getCopyFrom();
EndPoint copyToRaw = copyRoute.getCopyTo();
// Only Hadoop FS -> Hadoop FS replication is implemented; bail out with no work otherwise.
if (!(copyFromRaw instanceof HadoopFsEndPoint && copyToRaw instanceof HadoopFsEndPoint)) {
log.warn("Currently only handle the Hadoop Fs EndPoint replication");
return copyableFiles;
}
// For {@link HadoopFsEndPoint}s, set pathfilter and applyFilterToDirectories
HadoopFsEndPoint copyFrom = (HadoopFsEndPoint) copyFromRaw;
HadoopFsEndPoint copyTo = (HadoopFsEndPoint) copyToRaw;
copyFrom.setPathFilter(pathFilter);
copyFrom.setApplyFilterToDirectories(applyFilterToDirectories);
copyTo.setPathFilter(pathFilter);
copyTo.setApplyFilterToDirectories(applyFilterToDirectories);
// Watermark short-circuit: if the destination is already at or beyond the source
// watermark (or the source has no watermark while the destination does), nothing to copy.
if (this.watermarkEnabled) {
if ((!copyFromRaw.getWatermark().isPresent() && copyToRaw.getWatermark().isPresent()) || (copyFromRaw.getWatermark().isPresent() && copyToRaw.getWatermark().isPresent() && copyFromRaw.getWatermark().get().compareTo(copyToRaw.getWatermark().get()) <= 0)) {
log.info("No need to copy as destination watermark >= source watermark with source watermark {}, for dataset with metadata {}", copyFromRaw.getWatermark().isPresent() ? copyFromRaw.getWatermark().get().toJson() : "N/A", this.rc.getMetaData());
return copyableFiles;
}
}
Configuration conf = HadoopUtils.newConfiguration();
FileSystem copyFromFs = FileSystem.get(copyFrom.getFsURI(), conf);
FileSystem copyToFs = FileSystem.get(copyTo.getFsURI(), conf);
Collection<FileStatus> allFilesInSource = copyFrom.getFiles();
Collection<FileStatus> allFilesInTarget = copyTo.getFiles();
Set<FileStatus> copyFromFileStatuses = Sets.newHashSet(allFilesInSource);
// Index target files by scheme/authority-stripped path so source and target paths compare equal.
Map<Path, FileStatus> copyToFileMap = Maps.newHashMap();
for (FileStatus f : allFilesInTarget) {
copyToFileMap.put(PathUtils.getPathWithoutSchemeAndAuthority(f.getPath()), f);
}
Collection<Path> deletedPaths = Lists.newArrayList();
boolean watermarkMetadataCopied = false;
boolean deleteTargetIfNotExistOnSource = rc.isDeleteTargetIfNotExistOnSource();
for (FileStatus originFileStatus : copyFromFileStatuses) {
Path relative = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(originFileStatus.getPath()), PathUtils.getPathWithoutSchemeAndAuthority(copyFrom.getDatasetPath()));
// construct the new path in the target file system
Path newPath = new Path(copyTo.getDatasetPath(), relative);
if (relative.toString().equals(ReplicaHadoopFsEndPoint.WATERMARK_FILE)) {
watermarkMetadataCopied = true;
}
// skip copy same file: same length AND target strictly newer than source.
// NOTE(review): equal modification times fall through to re-copy — confirm intended.
if (copyToFileMap.containsKey(newPath) && copyToFileMap.get(newPath).getLen() == originFileStatus.getLen() && copyToFileMap.get(newPath).getModificationTime() > originFileStatus.getModificationTime()) {
log.debug("Copy from timestamp older than copy to timestamp, skipped copy {} for dataset with metadata {}", originFileStatus.getPath(), this.rc.getMetaData());
} else {
// need to remove those files in the target File System (they will be overwritten)
if (copyToFileMap.containsKey(newPath)) {
deletedPaths.add(newPath);
}
CopyableFile copyableFile = CopyableFile.fromOriginAndDestination(copyFromFs, originFileStatus, copyToFs.makeQualified(newPath), copyConfiguration).fileSet(PathUtils.getPathWithoutSchemeAndAuthority(copyTo.getDatasetPath()).toString()).build();
copyableFile.setFsDatasets(copyFromFs, copyToFs);
copyableFiles.add(copyableFile);
}
// clean up already checked paths; whatever remains in copyToFileMap afterwards
// exists only on the target.
copyToFileMap.remove(newPath);
}
// delete the paths on target directory if NOT exists on source
if (deleteTargetIfNotExistOnSource) {
deletedPaths.addAll(copyToFileMap.keySet());
}
// delete old files first (pre-publish step, priority 0, runs before the copies land)
if (!deletedPaths.isEmpty()) {
DeleteFileCommitStep deleteCommitStep = DeleteFileCommitStep.fromPaths(copyToFs, deletedPaths, this.props);
copyableFiles.add(new PrePublishStep(copyTo.getDatasetPath().toString(), Maps.<String, String>newHashMap(), deleteCommitStep, 0));
}
// generate the watermark file even if watermark checking is disabled, so that the
// feature can become functional once desired.
if ((!watermarkMetadataCopied) && copyFrom.getWatermark().isPresent()) {
copyableFiles.add(new PostPublishStep(copyTo.getDatasetPath().toString(), Maps.<String, String>newHashMap(), new WatermarkMetadataGenerationCommitStep(copyTo.getFsURI().toString(), copyTo.getDatasetPath(), copyFrom.getWatermark().get()), 1));
}
return copyableFiles;
}
Example usage of org.apache.gobblin.util.commit.DeleteFileCommitStep in the Apache project incubator-gobblin, from class RecursiveCopyableDataset, method getCopyableFiles.
/**
 * Computes the copy entities for this recursively-copied dataset: diffs the files under
 * the source glob against the publish directory, emitting copies for new/changed files,
 * and (in update/delete modes) a pre-publish {@link DeleteFileCommitStep} for files that
 * must be removed from the target first.
 *
 * @throws IOException if files already exist at the destination and update mode is off
 */
@Override
public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration configuration) throws IOException {
Path nonGlobSearchPath = PathUtils.deepestNonGlobPath(this.glob);
Path targetPath = new Path(configuration.getPublishDir(), PathUtils.relativizePath(this.rootPath, nonGlobSearchPath));
// Both maps are keyed by path relative to the respective root, so keys compare equal
// across the two file systems.
Map<Path, FileStatus> filesInSource = createPathMap(getFilesAtPath(this.fs, this.rootPath, this.pathFilter), this.rootPath);
Map<Path, FileStatus> filesInTarget = createPathMap(getFilesAtPath(targetFs, targetPath, this.pathFilter), targetPath);
List<Path> toCopy = Lists.newArrayList();
Map<Path, FileStatus> toDelete = Maps.newHashMap();
boolean requiresUpdate = false;
// Diff source vs. target. remove() both looks up the target status and shrinks
// filesInTarget, so after this loop it holds only the target-only files.
for (Map.Entry<Path, FileStatus> entry : filesInSource.entrySet()) {
FileStatus statusInTarget = filesInTarget.remove(entry.getKey());
if (statusInTarget != null) {
// in both
if (!sameFile(filesInSource.get(entry.getKey()), statusInTarget)) {
// changed file: re-copy, and delete the stale target version first
toCopy.add(entry.getKey());
toDelete.put(entry.getKey(), statusInTarget);
requiresUpdate = true;
}
} else {
// source-only file: plain copy
toCopy.add(entry.getKey());
}
}
// Overwriting existing destination files is only allowed in update mode.
if (!this.update && requiresUpdate) {
throw new IOException("Some files need to be copied but they already exist in the destination. " + "Aborting because not running in update mode.");
}
// In delete mode, also remove target files that no longer exist at the source
// (filesInTarget now contains exactly those, see loop above).
if (this.delete) {
toDelete.putAll(filesInTarget);
}
List<CopyEntity> copyEntities = Lists.newArrayList();
List<CopyableFile> copyableFiles = Lists.newArrayList();
for (Path path : toCopy) {
FileStatus file = filesInSource.get(path);
// Destination preserves the file's layout relative to the deepest non-glob prefix.
Path filePathRelativeToSearchPath = PathUtils.relativizePath(file.getPath(), nonGlobSearchPath);
Path thisTargetPath = new Path(configuration.getPublishDir(), filePathRelativeToSearchPath);
CopyableFile copyableFile = CopyableFile.fromOriginAndDestination(this.fs, file, thisTargetPath, configuration).fileSet(datasetURN()).datasetOutputPath(thisTargetPath.toString()).ancestorsOwnerAndPermission(CopyableFile.resolveReplicatedOwnerAndPermissionsRecursively(this.fs, file.getPath().getParent(), nonGlobSearchPath, configuration)).build();
copyableFile.setFsDatasets(this.fs, targetFs);
copyableFiles.add(copyableFile);
}
// Let the configured filter veto individual copies before they become entities.
copyEntities.addAll(this.copyableFileFilter.filter(this.fs, targetFs, copyableFiles));
if (!toDelete.isEmpty()) {
// Deletions run pre-publish; parent-directory cleanup (when enabled) is capped at
// targetPath so the step never deletes above the dataset's publish root.
CommitStep step = new DeleteFileCommitStep(targetFs, toDelete.values(), this.properties, this.deleteEmptyDirectories ? Optional.of(targetPath) : Optional.<Path>absent());
copyEntities.add(new PrePublishStep(datasetURN(), Maps.<String, String>newHashMap(), step, 1));
}
return copyEntities;
}
Aggregations