Use of org.apache.gobblin.data.management.copy.CopyConfiguration in project incubator-gobblin by Apache.
The class ConfigBasedDataset, method getCopyableFiles.
/**
 * Computes the copy work needed to replicate this dataset from the source end point of
 * {@code copyRoute} to its target end point.
 *
 * <p>The method proceeds as follows:
 * <ol>
 *   <li>Returns an empty list (after logging a warning) unless both end points are
 *       {@code HadoopFsEndPoint}s.</li>
 *   <li>When {@code watermarkEnabled} is set, returns an empty list if the target has a watermark
 *       and the source either has none or has one that compares less than or equal to the
 *       target's.</li>
 *   <li>Lists the files of both end points and emits one {@code CopyableFile} per source file,
 *       skipping a file only when the target already holds a file at the corresponding path with
 *       the same length and a strictly newer modification time.</li>
 *   <li>Adds a {@code PrePublishStep} that deletes every target file about to be overwritten and,
 *       if {@code rc.isDeleteTargetIfNotExistOnSource()} is true, every listed target file that
 *       has no counterpart on the source.</li>
 *   <li>Adds a {@code PostPublishStep} that generates the watermark metadata when the source has
 *       a watermark and the watermark file itself was not among the source files.</li>
 * </ol>
 *
 * @param targetFs not read by this implementation; the target file system is resolved from the
 *        URI of the copy route's target end point instead
 * @param copyConfiguration handed unchanged to {@code CopyableFile.fromOriginAndDestination}
 * @return the copy entities (copyable files plus optional pre/post publish steps); may be empty
 * @throws IOException propagated from the file system and end point calls made here
 */
@Override
public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration copyConfiguration) throws IOException {
  List<CopyEntity> copyableFiles = Lists.newArrayList();
  EndPoint copyFromRaw = copyRoute.getCopyFrom();
  EndPoint copyToRaw = copyRoute.getCopyTo();
  if (!(copyFromRaw instanceof HadoopFsEndPoint && copyToRaw instanceof HadoopFsEndPoint)) {
    log.warn("Currently only handle the Hadoop Fs EndPoint replication");
    return copyableFiles;
  }

  // For {@link HadoopFsEndPoint}s, set pathfilter and applyFilterToDirectories
  HadoopFsEndPoint copyFrom = (HadoopFsEndPoint) copyFromRaw;
  HadoopFsEndPoint copyTo = (HadoopFsEndPoint) copyToRaw;
  copyFrom.setPathFilter(pathFilter);
  copyFrom.setApplyFilterToDirectories(applyFilterToDirectories);
  copyTo.setPathFilter(pathFilter);
  copyTo.setApplyFilterToDirectories(applyFilterToDirectories);

  if (this.watermarkEnabled) {
    boolean sourceHasWatermark = copyFromRaw.getWatermark().isPresent();
    boolean targetHasWatermark = copyToRaw.getWatermark().isPresent();
    // Nothing to do when the target carries a watermark and the source is either unmarked or not
    // ahead of it. The comparison is only evaluated when both watermarks are present.
    boolean targetUpToDate = targetHasWatermark
        && (!sourceHasWatermark
            || copyFromRaw.getWatermark().get().compareTo(copyToRaw.getWatermark().get()) <= 0);
    if (targetUpToDate) {
      log.info("No need to copy as destination watermark >= source watermark with source watermark {}, for dataset with metadata {}", sourceHasWatermark ? copyFromRaw.getWatermark().get().toJson() : "N/A", this.rc.getMetaData());
      return copyableFiles;
    }
  }

  Configuration conf = HadoopUtils.newConfiguration();
  FileSystem copyFromFs = FileSystem.get(copyFrom.getFsURI(), conf);
  FileSystem copyToFs = FileSystem.get(copyTo.getFsURI(), conf);

  Collection<FileStatus> allFilesInSource = copyFrom.getFiles();
  Collection<FileStatus> allFilesInTarget = copyTo.getFiles();
  Set<FileStatus> copyFromFileStatuses = Sets.newHashSet(allFilesInSource);

  // Index target files by scheme-less path. Entries are removed as their source counterpart is
  // visited, so whatever remains after the loop exists on the target only.
  Map<Path, FileStatus> copyToFileMap = Maps.newHashMap();
  for (FileStatus f : allFilesInTarget) {
    copyToFileMap.put(PathUtils.getPathWithoutSchemeAndAuthority(f.getPath()), f);
  }

  Collection<Path> deletedPaths = Lists.newArrayList();
  boolean watermarkMetadataCopied = false;
  boolean deleteTargetIfNotExistOnSource = rc.isDeleteTargetIfNotExistOnSource();

  // Loop-invariant values, computed once instead of once per source file.
  Path sourceDatasetPath = PathUtils.getPathWithoutSchemeAndAuthority(copyFrom.getDatasetPath());
  Path targetDatasetPath = copyTo.getDatasetPath();
  String fileSet = PathUtils.getPathWithoutSchemeAndAuthority(targetDatasetPath).toString();

  for (FileStatus originFileStatus : copyFromFileStatuses) {
    Path relative = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(originFileStatus.getPath()), sourceDatasetPath);
    // construct the new path in the target file system
    Path newPath = new Path(targetDatasetPath, relative);

    if (relative.toString().equals(ReplicaHadoopFsEndPoint.WATERMARK_FILE)) {
      watermarkMetadataCopied = true;
    }

    // A single remove() both fetches the target counterpart (null when absent; stored values are
    // never null) and marks the path as already checked.
    FileStatus existingTarget = copyToFileMap.remove(newPath);

    // skip copy same file
    if (existingTarget != null
        && existingTarget.getLen() == originFileStatus.getLen()
        && existingTarget.getModificationTime() > originFileStatus.getModificationTime()) {
      log.debug("Copy from timestamp older than copy to timestamp, skipped copy {} for dataset with metadata {}", originFileStatus.getPath(), this.rc.getMetaData());
      continue;
    }

    // need to remove those files in the target File System
    if (existingTarget != null) {
      deletedPaths.add(newPath);
    }
    CopyableFile copyableFile = CopyableFile.fromOriginAndDestination(copyFromFs, originFileStatus, copyToFs.makeQualified(newPath), copyConfiguration).fileSet(fileSet).build();
    copyableFile.setFsDatasets(copyFromFs, copyToFs);
    copyableFiles.add(copyableFile);
  }

  // delete the paths on target directory if NOT exists on source
  if (deleteTargetIfNotExistOnSource) {
    deletedPaths.addAll(copyToFileMap.keySet());
  }

  // delete old files first
  if (!deletedPaths.isEmpty()) {
    DeleteFileCommitStep deleteCommitStep = DeleteFileCommitStep.fromPaths(copyToFs, deletedPaths, this.props);
    copyableFiles.add(new PrePublishStep(targetDatasetPath.toString(), Maps.<String, String>newHashMap(), deleteCommitStep, 0));
  }

  // Generate the watermark file even if watermark checking is disabled, so that the check can
  // become functional once desired. The source watermark is re-read here, after the file listing.
  if ((!watermarkMetadataCopied) && copyFrom.getWatermark().isPresent()) {
    copyableFiles.add(new PostPublishStep(targetDatasetPath.toString(), Maps.<String, String>newHashMap(), new WatermarkMetadataGenerationCommitStep(copyTo.getFsURI().toString(), targetDatasetPath, copyFrom.getWatermark().get()), 1));
  }
  return copyableFiles;
}
Aggregations