Use of org.apache.gobblin.commit.CommitStep in project incubator-gobblin by apache.
The class CopyDataPublisher, method getCommitSequence.
private static List<CommitStep> getCommitSequence(Collection<WorkUnitState> workUnits, Class<?> baseClass)
    throws IOException {
  List<CommitStepCopyEntity> steps = Lists.newArrayList();
  for (WorkUnitState wus : workUnits) {
    if (baseClass.isAssignableFrom(CopySource.getCopyEntityClass(wus))) {
      CommitStepCopyEntity step = (CommitStepCopyEntity) CopySource.deserializeCopyEntity(wus);
      steps.add(step);
    }
  }
  // Sort ascending by priority: steps with lower priority values execute first.
  Comparator<CommitStepCopyEntity> commitStepSorter = new Comparator<CommitStepCopyEntity>() {
    @Override
    public int compare(CommitStepCopyEntity o1, CommitStepCopyEntity o2) {
      return Integer.compare(o1.getPriority(), o2.getPriority());
    }
  };
  Collections.sort(steps, commitStepSorter);
  List<CommitStep> sequence = Lists.newArrayList();
  for (CommitStepCopyEntity entity : steps) {
    sequence.add(entity.getStep());
  }
  return sequence;
}
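Because the comparator sorts ascending on getPriority(), a step with priority 0 runs before one with priority 1. A minimal sketch of emitting two ordered pre-publish steps (the constructor shape mirrors the PrePublishStep usage in the RecursiveCopyableDataset example further down this page; datasetUrn, mkdirStep and deleteStep are hypothetical names):

  // Hypothetical: two pre-publish steps whose relative order is fixed by priority.
  List<CopyEntity> copyEntities = Lists.newArrayList();
  // Priority 0 sorts first, so this step runs before the delete step below.
  copyEntities.add(new PrePublishStep(datasetUrn, Maps.<String, String>newHashMap(), mkdirStep, 0));
  // Priority 1 sorts second.
  copyEntities.add(new PrePublishStep(datasetUrn, Maps.<String, String>newHashMap(), deleteStep, 1));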
Use of org.apache.gobblin.commit.CommitStep in project incubator-gobblin by apache.
The class RecursiveCopyableDatasetTest, method classifyFiles.
private ClassifiedFiles classifyFiles(Collection<? extends CopyEntity> copyEntities) {
  Map<Path, Path> pathsToCopy = Maps.newHashMap();
  Set<Path> pathsToDelete = Sets.newHashSet();
  for (CopyEntity ce : copyEntities) {
    if (ce instanceof CopyableFile) {
      pathsToCopy.put(((CopyableFile) ce).getOrigin().getPath(), ((CopyableFile) ce).getDestination());
    }
    if (ce instanceof CommitStepCopyEntity) {
      CommitStep step = ((CommitStepCopyEntity) ce).getStep();
      if (step instanceof DeleteFileCommitStep) {
        for (FileStatus status : ((DeleteFileCommitStep) step).getPathsToDelete()) {
          pathsToDelete.add(status.getPath());
        }
      }
    }
  }
  return new ClassifiedFiles(pathsToCopy, pathsToDelete);
}
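Within the same test class, this helper lends itself to assertions along the following lines. This is only a sketch: ClassifiedFiles is a private helper of the test, and the accessor names getPathsToCopy()/getPathsToDelete() are assumptions here, as are the paths and the dataset/configuration variables.

  // Hypothetical assertion block inside a test method of RecursiveCopyableDatasetTest (TestNG).
  Collection<? extends CopyEntity> copyEntities = dataset.getCopyableFiles(targetFs, copyConfiguration);
  ClassifiedFiles classified = classifyFiles(copyEntities);
  // Every source file should map to its expected destination...
  Assert.assertEquals(classified.getPathsToCopy().get(new Path("/source/file1")), new Path("/target/file1"));
  // ...and stale destination files should be scheduled for deletion by a DeleteFileCommitStep.
  Assert.assertTrue(classified.getPathsToDelete().contains(new Path("/target/stale")));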
Use of org.apache.gobblin.commit.CommitStep in project incubator-gobblin by apache.
The class CopyDataPublisher, method publishFileSet.
/**
 * Publish data for a {@link CopyableDataset}.
 */
private void publishFileSet(CopyEntity.DatasetAndPartition datasetAndPartition,
    Collection<WorkUnitState> datasetWorkUnitStates) throws IOException {
  Map<String, String> additionalMetadata = Maps.newHashMap();
  Preconditions.checkArgument(!datasetWorkUnitStates.isEmpty(),
      "publishFileSet received an empty collection of work units. This is an error in the code.");
  CopyableDatasetMetadata metadata = CopyableDatasetMetadata.deserialize(
      datasetWorkUnitStates.iterator().next().getProp(CopySource.SERIALIZED_COPYABLE_DATASET));
  Path datasetWriterOutputPath = new Path(this.writerOutputDir, datasetAndPartition.identifier());
  log.info(String.format("[%s] Publishing fileSet from %s for dataset %s", datasetAndPartition.identifier(),
      datasetWriterOutputPath, metadata.getDatasetURN()));
  List<CommitStep> prePublish = getCommitSequence(datasetWorkUnitStates, PrePublishStep.class);
  List<CommitStep> postPublish = getCommitSequence(datasetWorkUnitStates, PostPublishStep.class);
  log.info(String.format("[%s] Found %d prePublish steps and %d postPublish steps.",
      datasetAndPartition.identifier(), prePublish.size(), postPublish.size()));
  executeCommitSequence(prePublish);
  if (hasCopyableFiles(datasetWorkUnitStates)) {
    // Targets are always absolute, so we start moving from root (will skip any existing directories).
    HadoopUtils.renameRecursively(this.fs, datasetWriterOutputPath, new Path("/"));
  } else {
    log.info(String.format("[%s] No copyable files in dataset. Proceeding to postPublish steps.",
        datasetAndPartition.identifier()));
  }
  executeCommitSequence(postPublish);
  this.fs.delete(datasetWriterOutputPath, true);
  long datasetOriginTimestamp = Long.MAX_VALUE;
  long datasetUpstreamTimestamp = Long.MAX_VALUE;
  Optional<String> fileSetRoot = Optional.<String>absent();
  for (WorkUnitState wus : datasetWorkUnitStates) {
    if (wus.getWorkingState() == WorkingState.SUCCESSFUL) {
      wus.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
    }
    CopyEntity copyEntity = CopySource.deserializeCopyEntity(wus);
    if (copyEntity instanceof CopyableFile) {
      CopyableFile copyableFile = (CopyableFile) copyEntity;
      if (wus.getWorkingState() == WorkingState.COMMITTED) {
        CopyEventSubmitterHelper.submitSuccessfulFilePublish(this.eventSubmitter, copyableFile, wus);
        // Currently datasetOutputPath is only present for hive datasets.
        if (!fileSetRoot.isPresent() && copyableFile.getDatasetOutputPath() != null) {
          fileSetRoot = Optional.of(copyableFile.getDatasetOutputPath());
        }
        if (lineageInfo.isPresent()) {
          lineageInfo.get().putDestination(copyableFile.getDestinationDataset(), 0, wus);
        }
      }
      if (datasetOriginTimestamp > copyableFile.getOriginTimestamp()) {
        datasetOriginTimestamp = copyableFile.getOriginTimestamp();
      }
      if (datasetUpstreamTimestamp > copyableFile.getUpstreamTimestamp()) {
        datasetUpstreamTimestamp = copyableFile.getUpstreamTimestamp();
      }
    }
  }
  // If no file lowered the timestamps, replace the Long.MAX_VALUE sentinels with 0.
  if (Long.MAX_VALUE == datasetOriginTimestamp) {
    datasetOriginTimestamp = 0;
  }
  if (Long.MAX_VALUE == datasetUpstreamTimestamp) {
    datasetUpstreamTimestamp = 0;
  }
  additionalMetadata.put(SlaEventKeys.SOURCE_URI, this.state.getProp(SlaEventKeys.SOURCE_URI));
  additionalMetadata.put(SlaEventKeys.DESTINATION_URI, this.state.getProp(SlaEventKeys.DESTINATION_URI));
  additionalMetadata.put(SlaEventKeys.DATASET_OUTPUT_PATH, fileSetRoot.or("Unknown"));
  CopyEventSubmitterHelper.submitSuccessfulDatasetPublish(this.eventSubmitter, datasetAndPartition,
      Long.toString(datasetOriginTimestamp), Long.toString(datasetUpstreamTimestamp), additionalMetadata);
}
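The Long.MAX_VALUE sentinel pattern above can be pulled into a small helper for readability. This is an illustrative refactoring sketch, not code from Gobblin; it reuses only the getOriginTimestamp() accessor shown in the method:

  /** Illustrative helper: the smallest origin timestamp across files, or 0 if none were seen. */
  private static long minOriginTimestampOrZero(Collection<CopyableFile> files) {
    long min = Long.MAX_VALUE;
    for (CopyableFile file : files) {
      min = Math.min(min, file.getOriginTimestamp());
    }
    // No file lowered the sentinel, so report 0, matching publishFileSet's behavior.
    return min == Long.MAX_VALUE ? 0L : min;
  }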
Use of org.apache.gobblin.commit.CommitStep in project incubator-gobblin by apache.
The class RecursiveCopyableDataset, method getCopyableFiles.
@Override
public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration configuration)
    throws IOException {
  Path nonGlobSearchPath = PathUtils.deepestNonGlobPath(this.glob);
  Path targetPath = new Path(configuration.getPublishDir(), PathUtils.relativizePath(this.rootPath, nonGlobSearchPath));
  Map<Path, FileStatus> filesInSource = createPathMap(getFilesAtPath(this.fs, this.rootPath, this.pathFilter), this.rootPath);
  Map<Path, FileStatus> filesInTarget = createPathMap(getFilesAtPath(targetFs, targetPath, this.pathFilter), targetPath);
  List<Path> toCopy = Lists.newArrayList();
  Map<Path, FileStatus> toDelete = Maps.newHashMap();
  boolean requiresUpdate = false;
  for (Map.Entry<Path, FileStatus> entry : filesInSource.entrySet()) {
    FileStatus statusInTarget = filesInTarget.remove(entry.getKey());
    if (statusInTarget != null) {
      // The file exists in both source and target; recopy only if they differ.
      if (!sameFile(filesInSource.get(entry.getKey()), statusInTarget)) {
        toCopy.add(entry.getKey());
        toDelete.put(entry.getKey(), statusInTarget);
        requiresUpdate = true;
      }
    } else {
      toCopy.add(entry.getKey());
    }
  }
  if (!this.update && requiresUpdate) {
    throw new IOException("Some files need to be copied but they already exist in the destination. "
        + "Aborting because not running in update mode.");
  }
  if (this.delete) {
    toDelete.putAll(filesInTarget);
  }
  List<CopyEntity> copyEntities = Lists.newArrayList();
  List<CopyableFile> copyableFiles = Lists.newArrayList();
  for (Path path : toCopy) {
    FileStatus file = filesInSource.get(path);
    Path filePathRelativeToSearchPath = PathUtils.relativizePath(file.getPath(), nonGlobSearchPath);
    Path thisTargetPath = new Path(configuration.getPublishDir(), filePathRelativeToSearchPath);
    CopyableFile copyableFile = CopyableFile.fromOriginAndDestination(this.fs, file, thisTargetPath, configuration)
        .fileSet(datasetURN())
        .datasetOutputPath(thisTargetPath.toString())
        .ancestorsOwnerAndPermission(CopyableFile.resolveReplicatedOwnerAndPermissionsRecursively(this.fs,
            file.getPath().getParent(), nonGlobSearchPath, configuration))
        .build();
    copyableFile.setFsDatasets(this.fs, targetFs);
    copyableFiles.add(copyableFile);
  }
  copyEntities.addAll(this.copyableFileFilter.filter(this.fs, targetFs, copyableFiles));
  if (!toDelete.isEmpty()) {
    CommitStep step = new DeleteFileCommitStep(targetFs, toDelete.values(), this.properties,
        this.deleteEmptyDirectories ? Optional.of(targetPath) : Optional.<Path>absent());
    copyEntities.add(new PrePublishStep(datasetURN(), Maps.<String, String>newHashMap(), step, 1));
  }
  return copyEntities;
}
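Both listings are narrowed by this.pathFilter, which getFilesAtPath applies on each file system. Assuming this is a standard org.apache.hadoop.fs.PathFilter, a minimal sketch of such a filter might look as follows (the class name and skip rules are assumptions, not the filter Gobblin ships):

  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.fs.PathFilter;

  /** Hypothetical filter: skip hidden and temporary files (e.g. _SUCCESS, .crc) during listing. */
  public class VisibleFilesPathFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
      String name = path.getName();
      return !name.startsWith("_") && !name.startsWith(".");
    }
  }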
Use of org.apache.gobblin.commit.CommitStep in project incubator-gobblin by apache.
The class GobblinMultiTaskAttempt, method commit.
/**
 * Commit {@link #tasks} by: 1. calling {@link Task#commit()} in parallel; 2. executing any additional
 * {@link CommitStep}s; 3. persisting the task state store.
 * @throws IOException
 */
public void commit() throws IOException {
  if (this.tasks == null || this.tasks.isEmpty()) {
    log.warn("No tasks to be committed in container " + containerIdOptional.or(""));
    return;
  }
  Iterator<Callable<Void>> callableIterator =
      Iterators.transform(this.tasks.iterator(), new Function<Task, Callable<Void>>() {
        @Override
        public Callable<Void> apply(final Task task) {
          return new Callable<Void>() {
            @Nullable
            @Override
            public Void call() throws Exception {
              task.commit();
              return null;
            }
          };
        }
      });
  try {
    List<Either<Void, ExecutionException>> executionResults =
        new IteratorExecutor<>(callableIterator, this.getTaskCommitThreadPoolSize(),
            ExecutorsUtils.newDaemonThreadFactory(Optional.of(log), Optional.of("Task-committing-pool-%d")))
            .executeAndGetResults();
    IteratorExecutor.logFailures(executionResults, log, 10);
  } catch (InterruptedException ie) {
    log.error("Committing of tasks interrupted. Aborting.");
    throw new RuntimeException(ie);
  } finally {
    persistTaskStateStore();
    if (this.cleanupCommitSteps != null) {
      for (CommitStep cleanupCommitStep : this.cleanupCommitSteps) {
        log.info("Executing additional commit step.");
        cleanupCommitStep.execute();
      }
    }
  }
}
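The finally block above calls execute() on every cleanup step unconditionally, so such steps should be safe to run even after a partial failure. A minimal sketch of an idempotent cleanup CommitStep, assuming the interface's two-method contract of isCompleted() and execute() (DeleteStagingDirStep is a hypothetical name, not a Gobblin class):

  import java.io.IOException;
  import org.apache.gobblin.commit.CommitStep;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;

  /** Hypothetical cleanup step: remove a task staging directory once commit has finished. */
  public class DeleteStagingDirStep implements CommitStep {

    private final FileSystem fs;
    private final Path stagingDir;

    public DeleteStagingDirStep(FileSystem fs, Path stagingDir) {
      this.fs = fs;
      this.stagingDir = stagingDir;
    }

    @Override
    public boolean isCompleted() throws IOException {
      // Done once the directory is gone; lets callers skip redundant work.
      return !this.fs.exists(this.stagingDir);
    }

    @Override
    public void execute() throws IOException {
      // Recursive delete; FileSystem.delete on a missing path is a no-op, keeping this idempotent.
      this.fs.delete(this.stagingDir, true);
    }
  }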