Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.
The class ConfigBasedDataset, method getCopyableFiles:
@Override
public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration copyConfiguration)
    throws IOException {
  List<CopyEntity> copyableFiles = Lists.newArrayList();
  EndPoint copyFromRaw = copyRoute.getCopyFrom();
  EndPoint copyToRaw = copyRoute.getCopyTo();
  if (!(copyFromRaw instanceof HadoopFsEndPoint && copyToRaw instanceof HadoopFsEndPoint)) {
    log.warn("Currently only Hadoop FS EndPoint replication is handled");
    return copyableFiles;
  }
  // For {@link HadoopFsEndPoint}s, set the path filter and whether it applies to directories
  HadoopFsEndPoint copyFrom = (HadoopFsEndPoint) copyFromRaw;
  HadoopFsEndPoint copyTo = (HadoopFsEndPoint) copyToRaw;
  copyFrom.setPathFilter(pathFilter);
  copyFrom.setApplyFilterToDirectories(applyFilterToDirectories);
  copyTo.setPathFilter(pathFilter);
  copyTo.setApplyFilterToDirectories(applyFilterToDirectories);
  if (this.watermarkEnabled) {
    if ((!copyFromRaw.getWatermark().isPresent() && copyToRaw.getWatermark().isPresent())
        || (copyFromRaw.getWatermark().isPresent() && copyToRaw.getWatermark().isPresent()
            && copyFromRaw.getWatermark().get().compareTo(copyToRaw.getWatermark().get()) <= 0)) {
      log.info("No need to copy as destination watermark >= source watermark with source watermark {}, for dataset with metadata {}",
          copyFromRaw.getWatermark().isPresent() ? copyFromRaw.getWatermark().get().toJson() : "N/A",
          this.rc.getMetaData());
      return copyableFiles;
    }
  }
  Configuration conf = HadoopUtils.newConfiguration();
  FileSystem copyFromFs = FileSystem.get(copyFrom.getFsURI(), conf);
  FileSystem copyToFs = FileSystem.get(copyTo.getFsURI(), conf);
  Collection<FileStatus> allFilesInSource = copyFrom.getFiles();
  Collection<FileStatus> allFilesInTarget = copyTo.getFiles();
  Set<FileStatus> copyFromFileStatuses = Sets.newHashSet(allFilesInSource);
  Map<Path, FileStatus> copyToFileMap = Maps.newHashMap();
  for (FileStatus f : allFilesInTarget) {
    copyToFileMap.put(PathUtils.getPathWithoutSchemeAndAuthority(f.getPath()), f);
  }
  Collection<Path> deletedPaths = Lists.newArrayList();
  boolean watermarkMetadataCopied = false;
  boolean deleteTargetIfNotExistOnSource = rc.isDeleteTargetIfNotExistOnSource();
  for (FileStatus originFileStatus : copyFromFileStatuses) {
    Path relative = PathUtils.relativizePath(
        PathUtils.getPathWithoutSchemeAndAuthority(originFileStatus.getPath()),
        PathUtils.getPathWithoutSchemeAndAuthority(copyFrom.getDatasetPath()));
    // construct the new path in the target file system
    Path newPath = new Path(copyTo.getDatasetPath(), relative);
    if (relative.toString().equals(ReplicaHadoopFsEndPoint.WATERMARK_FILE)) {
      watermarkMetadataCopied = true;
    }
    // skip copying a file that is already up to date on the target
    if (copyToFileMap.containsKey(newPath) && copyToFileMap.get(newPath).getLen() == originFileStatus.getLen()
        && copyToFileMap.get(newPath).getModificationTime() > originFileStatus.getModificationTime()) {
      log.debug("Source timestamp is older than target timestamp; skipped copy of {} for dataset with metadata {}",
          originFileStatus.getPath(), this.rc.getMetaData());
    } else {
      // a stale copy on the target file system must be removed first
      if (copyToFileMap.containsKey(newPath)) {
        deletedPaths.add(newPath);
      }
      CopyableFile copyableFile = CopyableFile
          .fromOriginAndDestination(copyFromFs, originFileStatus, copyToFs.makeQualified(newPath), copyConfiguration)
          .fileSet(PathUtils.getPathWithoutSchemeAndAuthority(copyTo.getDatasetPath()).toString())
          .build();
      copyableFile.setFsDatasets(copyFromFs, copyToFs);
      copyableFiles.add(copyableFile);
    }
    // clean up already-checked paths
    copyToFileMap.remove(newPath);
  }
  // delete paths in the target directory that do NOT exist on the source
  if (deleteTargetIfNotExistOnSource) {
    deletedPaths.addAll(copyToFileMap.keySet());
  }
  // delete old files first
  if (!deletedPaths.isEmpty()) {
    DeleteFileCommitStep deleteCommitStep = DeleteFileCommitStep.fromPaths(copyToFs, deletedPaths, this.props);
    copyableFiles.add(new PrePublishStep(copyTo.getDatasetPath().toString(), Maps.<String, String>newHashMap(), deleteCommitStep, 0));
  }
  // generate the watermark file even if watermark checking is disabled, so it can become functional once desired
  if ((!watermarkMetadataCopied) && copyFrom.getWatermark().isPresent()) {
    copyableFiles.add(new PostPublishStep(copyTo.getDatasetPath().toString(), Maps.<String, String>newHashMap(),
        new WatermarkMetadataGenerationCommitStep(copyTo.getFsURI().toString(), copyTo.getDatasetPath(), copyFrom.getWatermark().get()), 1));
  }
  return copyableFiles;
}
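The skip decision above reduces to a comparison of file length and modification time. A minimal standalone sketch of that predicate (the class and method names are illustrative, not part of Gobblin's API):

import org.apache.hadoop.fs.FileStatus;

// Illustrative helper, not part of Gobblin: a target file is considered up to
// date when it exists, has the same length as the source, and carries a
// strictly newer modification time, mirroring the check in getCopyableFiles.
public final class CopySkipCheck {
  static boolean targetIsUpToDate(FileStatus source, FileStatus target) {
    return target != null
        && target.getLen() == source.getLen()
        && target.getModificationTime() > source.getModificationTime();
  }
}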
Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.
The class CopyDataPublisher, method persistFailedFileSet:
private int persistFailedFileSet(Collection<? extends WorkUnitState> workUnitStates) throws IOException {
  int filesPersisted = 0;
  for (WorkUnitState wu : workUnitStates) {
    if (wu.getWorkingState() == WorkingState.SUCCESSFUL) {
      CopyEntity entity = CopySource.deserializeCopyEntity(wu);
      if (entity instanceof CopyableFile) {
        CopyableFile file = (CopyableFile) entity;
        Path outputDir = FileAwareInputStreamDataWriter.getOutputDir(wu);
        CopyableDatasetMetadata metadata = CopySource.deserializeCopyableDataset(wu);
        Path outputPath = FileAwareInputStreamDataWriter.getOutputFilePath(file, outputDir, file.getDatasetAndPartition(metadata));
        if (this.recoveryHelper.persistFile(wu, file, outputPath)) {
          filesPersisted++;
        }
      }
    }
  }
  return filesPersisted;
}
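Note that the method filters for SUCCESSFUL work units: the file set as a whole failed to publish, but individual files may already be written and are worth keeping for recovery. A hedged caller sketch (the surrounding variables are assumptions about the publisher's failure path):

// Hypothetical caller sketch: on a failed publish, hand every work-unit state
// of the file set to persistFailedFileSet; it persists only the files whose
// work units reached SUCCESSFUL, so a later run can recover them.
try {
  int persisted = persistFailedFileSet(datasetWorkUnitStates);
  log.info("Persisted {} files of the failed file set for future recovery.", persisted);
} catch (IOException e) {
  log.error("Could not persist files of the failed file set.", e);
}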
Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.
The class CopyDataPublisher, method publishFileSet:
/**
 * Publish data for a {@link CopyableDataset}.
 */
private void publishFileSet(CopyEntity.DatasetAndPartition datasetAndPartition, Collection<WorkUnitState> datasetWorkUnitStates)
    throws IOException {
  Map<String, String> additionalMetadata = Maps.newHashMap();
  Preconditions.checkArgument(!datasetWorkUnitStates.isEmpty(),
      "publishFileSet received an empty collection of work units. This is an error in the code.");
  CopyableDatasetMetadata metadata = CopyableDatasetMetadata.deserialize(
      datasetWorkUnitStates.iterator().next().getProp(CopySource.SERIALIZED_COPYABLE_DATASET));
  Path datasetWriterOutputPath = new Path(this.writerOutputDir, datasetAndPartition.identifier());
  log.info(String.format("[%s] Publishing fileSet from %s for dataset %s", datasetAndPartition.identifier(),
      datasetWriterOutputPath, metadata.getDatasetURN()));
  List<CommitStep> prePublish = getCommitSequence(datasetWorkUnitStates, PrePublishStep.class);
  List<CommitStep> postPublish = getCommitSequence(datasetWorkUnitStates, PostPublishStep.class);
  log.info(String.format("[%s] Found %d prePublish steps and %d postPublish steps.", datasetAndPartition.identifier(),
      prePublish.size(), postPublish.size()));
  executeCommitSequence(prePublish);
  if (hasCopyableFiles(datasetWorkUnitStates)) {
    // Targets are always absolute, so we start moving from root (will skip any existing directories).
    HadoopUtils.renameRecursively(this.fs, datasetWriterOutputPath, new Path("/"));
  } else {
    log.info(String.format("[%s] No copyable files in dataset. Proceeding to postPublish steps.", datasetAndPartition.identifier()));
  }
  executeCommitSequence(postPublish);
  this.fs.delete(datasetWriterOutputPath, true);
  long datasetOriginTimestamp = Long.MAX_VALUE;
  long datasetUpstreamTimestamp = Long.MAX_VALUE;
  Optional<String> fileSetRoot = Optional.<String>absent();
  for (WorkUnitState wus : datasetWorkUnitStates) {
    if (wus.getWorkingState() == WorkingState.SUCCESSFUL) {
      wus.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
    }
    CopyEntity copyEntity = CopySource.deserializeCopyEntity(wus);
    if (copyEntity instanceof CopyableFile) {
      CopyableFile copyableFile = (CopyableFile) copyEntity;
      if (wus.getWorkingState() == WorkingState.COMMITTED) {
        CopyEventSubmitterHelper.submitSuccessfulFilePublish(this.eventSubmitter, copyableFile, wus);
        // Currently datasetOutputPath is only present for hive datasets.
        if (!fileSetRoot.isPresent() && copyableFile.getDatasetOutputPath() != null) {
          fileSetRoot = Optional.of(copyableFile.getDatasetOutputPath());
        }
        if (lineageInfo.isPresent()) {
          lineageInfo.get().putDestination(copyableFile.getDestinationDataset(), 0, wus);
        }
      }
      if (datasetOriginTimestamp > copyableFile.getOriginTimestamp()) {
        datasetOriginTimestamp = copyableFile.getOriginTimestamp();
      }
      if (datasetUpstreamTimestamp > copyableFile.getUpstreamTimestamp()) {
        datasetUpstreamTimestamp = copyableFile.getUpstreamTimestamp();
      }
    }
  }
  // replace the Long.MAX_VALUE sentinel with a more readable default when no copyable file was seen
  if (Long.MAX_VALUE == datasetOriginTimestamp) {
    datasetOriginTimestamp = 0;
  }
  if (Long.MAX_VALUE == datasetUpstreamTimestamp) {
    datasetUpstreamTimestamp = 0;
  }
  additionalMetadata.put(SlaEventKeys.SOURCE_URI, this.state.getProp(SlaEventKeys.SOURCE_URI));
  additionalMetadata.put(SlaEventKeys.DESTINATION_URI, this.state.getProp(SlaEventKeys.DESTINATION_URI));
  additionalMetadata.put(SlaEventKeys.DATASET_OUTPUT_PATH, fileSetRoot.or("Unknown"));
  CopyEventSubmitterHelper.submitSuccessfulDatasetPublish(this.eventSubmitter, datasetAndPartition,
      Long.toString(datasetOriginTimestamp), Long.toString(datasetUpstreamTimestamp), additionalMetadata);
}
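The pre- and post-publish sequences run CommitStep instances such as the DeleteFileCommitStep built in ConfigBasedDataset above. A minimal sketch of a custom step, assuming Gobblin's CommitStep contract is isCompleted() plus execute() (the marker-file step itself is hypothetical, not part of Gobblin):

import java.io.IOException;
import org.apache.gobblin.commit.CommitStep;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical post-publish step that drops an empty marker file. isCompleted()
// makes the step idempotent: a re-run of the commit sequence skips steps that
// already took effect.
public class TouchMarkerCommitStep implements CommitStep {
  private final FileSystem fs;
  private final Path marker;

  public TouchMarkerCommitStep(FileSystem fs, Path marker) {
    this.fs = fs;
    this.marker = marker;
  }

  @Override
  public boolean isCompleted() throws IOException {
    return this.fs.exists(this.marker);
  }

  @Override
  public void execute() throws IOException {
    this.fs.create(this.marker).close();
  }
}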
Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.
The class TimestampBasedCopyableDataset, method getCopyableFiles:
@Override
public Collection<CopyableFile> getCopyableFiles(FileSystem targetFs, CopyConfiguration configuration) throws IOException {
  log.info(String.format("Getting copyable files at root path: %s", this.datasetRoot));
  List<TimestampedDatasetVersion> versions = Lists.newArrayList(this.datasetVersionFinder.findDatasetVersions(this));
  if (versions.isEmpty()) {
    log.warn("No dataset version can be found. Ignoring.");
    return Lists.newArrayList();
  }
  Collection<TimestampedDatasetVersion> copyableVersions = this.versionSelectionPolicy.listSelectedVersions(versions);
  ConcurrentLinkedQueue<CopyableFile> copyableFileList = new ConcurrentLinkedQueue<>();
  List<Future<?>> futures = Lists.newArrayList();
  for (TimestampedDatasetVersion copyableVersion : copyableVersions) {
    futures.add(this.executor.submit(this.getCopyableFileGenetator(targetFs, configuration, copyableVersion, copyableFileList)));
  }
  try {
    for (Future<?> future : futures) {
      future.get();
    }
  } catch (ExecutionException | InterruptedException e) {
    throw new IOException("Failed to generate copyable files.", e);
  } finally {
    ExecutorsUtils.shutdownExecutorService(executor, Optional.of(log));
  }
  return copyableFileList;
}
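Each submitted task builds the CopyableFiles for one selected version and appends them to the shared ConcurrentLinkedQueue, which is safe under concurrent writers. A sketch of what one such generator task might look like (the method name and parameters are illustrative, not the dataset's actual generator):

// Illustrative generator task, not Gobblin's implementation: build a
// CopyableFile for one source file and append it to the shared queue. Any
// failure surfaces through Future.get() as an ExecutionException, which the
// caller above rewraps as an IOException.
static Runnable copyTask(FileSystem srcFs, FileSystem targetFs, FileStatus origin, Path destination,
    CopyConfiguration configuration, ConcurrentLinkedQueue<CopyableFile> sink) {
  return () -> {
    try {
      sink.add(CopyableFile
          .fromOriginAndDestination(srcFs, origin, targetFs.makeQualified(destination), configuration)
          .build());
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  };
}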
Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.
The class FileAwareInputStreamDataWriter, method writeImpl:
@Override
public final void writeImpl(FileAwareInputStream fileAwareInputStream) throws IOException {
  CopyableFile copyableFile = fileAwareInputStream.getFile();
  if (encryptionConfig != null) {
    copyableFile.setDestination(PathUtils.addExtension(copyableFile.getDestination(),
        "." + EncryptionConfigParser.getEncryptionType(encryptionConfig)));
  }
  Path stagingFile = getStagingFilePath(copyableFile);
  if (this.actualProcessedCopyableFile.isPresent()) {
    throw new IOException(this.getClass().getCanonicalName() + " can only process one file.");
  }
  this.actualProcessedCopyableFile = Optional.of(copyableFile);
  this.fs.mkdirs(stagingFile.getParent());
  writeImpl(fileAwareInputStream.getInputStream(), stagingFile, copyableFile);
  this.filesWritten.incrementAndGet();
}
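When encryption is configured, the destination path gains an extension derived from the encryption type before the file is staged. A small illustration of that path rewrite (the input path and encryption-type value are assumptions):

// Illustrative only: PathUtils.addExtension appends the suffix to the final
// path component, so downstream consumers can recognize encrypted files.
Path destination = new Path("/data/output/part-000.avro");
String encryptionType = "aes_rotating"; // assumed value; writeImpl reads it from EncryptionConfigParser
Path encrypted = PathUtils.addExtension(destination, "." + encryptionType);
// encrypted is now /data/output/part-000.avro.aes_rotating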