Use of org.apache.gobblin.data.management.copy.CopyableDatasetMetadata in project incubator-gobblin by apache.
The class CopyDataPublisher, method persistFailedFileSet:
private int persistFailedFileSet(Collection<? extends WorkUnitState> workUnitStates) throws IOException {
  int filesPersisted = 0;
  for (WorkUnitState wu : workUnitStates) {
    if (wu.getWorkingState() == WorkingState.SUCCESSFUL) {
      CopyEntity entity = CopySource.deserializeCopyEntity(wu);
      if (entity instanceof CopyableFile) {
        CopyableFile file = (CopyableFile) entity;
        Path outputDir = FileAwareInputStreamDataWriter.getOutputDir(wu);
        CopyableDatasetMetadata metadata = CopySource.deserializeCopyableDataset(wu);
        Path outputPath = FileAwareInputStreamDataWriter.getOutputFilePath(file, outputDir, file.getDatasetAndPartition(metadata));
        if (this.recoveryHelper.persistFile(wu, file, outputPath)) {
          filesPersisted++;
        }
      }
    }
  }
  return filesPersisted;
}
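For orientation, here is a minimal sketch of the round trip that makes the deserialization calls above work. Only CopySource.SERIALIZED_COPYABLE_DATASET, CopyableDatasetMetadata.deserialize, and CopySource.deserializeCopyableDataset appear in the snippets on this page; the serialize() call and the workUnit/workUnitState variables are assumptions about the source side, not verbatim project code.

// Sketch (hedged): the dataset metadata is stored on the work unit as a serialized property,
// then read back by the writer/publisher, either directly with deserialize (as in publishFileSet
// below) or via the CopySource.deserializeCopyableDataset helper used above.
workUnit.setProp(CopySource.SERIALIZED_COPYABLE_DATASET, metadata.serialize()); // serialize() assumed
CopyableDatasetMetadata restored =
    CopyableDatasetMetadata.deserialize(workUnitState.getProp(CopySource.SERIALIZED_COPYABLE_DATASET));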
Use of org.apache.gobblin.data.management.copy.CopyableDatasetMetadata in project incubator-gobblin by apache.
The class CopyDataPublisher, method publishFileSet:
/**
* Publish data for a {@link CopyableDataset}.
*/
private void publishFileSet(CopyEntity.DatasetAndPartition datasetAndPartition, Collection<WorkUnitState> datasetWorkUnitStates) throws IOException {
  Map<String, String> additionalMetadata = Maps.newHashMap();
  Preconditions.checkArgument(!datasetWorkUnitStates.isEmpty(), "publishFileSet received an empty collection work units. This is an error in code.");
  CopyableDatasetMetadata metadata = CopyableDatasetMetadata.deserialize(datasetWorkUnitStates.iterator().next().getProp(CopySource.SERIALIZED_COPYABLE_DATASET));
  Path datasetWriterOutputPath = new Path(this.writerOutputDir, datasetAndPartition.identifier());
  log.info(String.format("[%s] Publishing fileSet from %s for dataset %s", datasetAndPartition.identifier(), datasetWriterOutputPath, metadata.getDatasetURN()));
  List<CommitStep> prePublish = getCommitSequence(datasetWorkUnitStates, PrePublishStep.class);
  List<CommitStep> postPublish = getCommitSequence(datasetWorkUnitStates, PostPublishStep.class);
  log.info(String.format("[%s] Found %d prePublish steps and %d postPublish steps.", datasetAndPartition.identifier(), prePublish.size(), postPublish.size()));
  executeCommitSequence(prePublish);
  if (hasCopyableFiles(datasetWorkUnitStates)) {
    // Targets are always absolute, so we start moving from root (will skip any existing directories).
    HadoopUtils.renameRecursively(this.fs, datasetWriterOutputPath, new Path("/"));
  } else {
    log.info(String.format("[%s] No copyable files in dataset. Proceeding to postpublish steps.", datasetAndPartition.identifier()));
  }
  executeCommitSequence(postPublish);
  this.fs.delete(datasetWriterOutputPath, true);
  long datasetOriginTimestamp = Long.MAX_VALUE;
  long datasetUpstreamTimestamp = Long.MAX_VALUE;
  Optional<String> fileSetRoot = Optional.<String>absent();
  for (WorkUnitState wus : datasetWorkUnitStates) {
    if (wus.getWorkingState() == WorkingState.SUCCESSFUL) {
      wus.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
    }
    CopyEntity copyEntity = CopySource.deserializeCopyEntity(wus);
    if (copyEntity instanceof CopyableFile) {
      CopyableFile copyableFile = (CopyableFile) copyEntity;
      if (wus.getWorkingState() == WorkingState.COMMITTED) {
        CopyEventSubmitterHelper.submitSuccessfulFilePublish(this.eventSubmitter, copyableFile, wus);
        // Currently datasetOutputPath is only present for hive datasets.
        if (!fileSetRoot.isPresent() && copyableFile.getDatasetOutputPath() != null) {
          fileSetRoot = Optional.of(copyableFile.getDatasetOutputPath());
        }
        if (lineageInfo.isPresent()) {
          lineageInfo.get().putDestination(copyableFile.getDestinationDataset(), 0, wus);
        }
      }
      if (datasetOriginTimestamp > copyableFile.getOriginTimestamp()) {
        datasetOriginTimestamp = copyableFile.getOriginTimestamp();
      }
      if (datasetUpstreamTimestamp > copyableFile.getUpstreamTimestamp()) {
        datasetUpstreamTimestamp = copyableFile.getUpstreamTimestamp();
      }
    }
  }
  // If no copyable file contributed a timestamp, report 0 (something more readable than Long.MAX_VALUE).
  if (Long.MAX_VALUE == datasetOriginTimestamp) {
    datasetOriginTimestamp = 0;
  }
  if (Long.MAX_VALUE == datasetUpstreamTimestamp) {
    datasetUpstreamTimestamp = 0;
  }
  additionalMetadata.put(SlaEventKeys.SOURCE_URI, this.state.getProp(SlaEventKeys.SOURCE_URI));
  additionalMetadata.put(SlaEventKeys.DESTINATION_URI, this.state.getProp(SlaEventKeys.DESTINATION_URI));
  additionalMetadata.put(SlaEventKeys.DATASET_OUTPUT_PATH, fileSetRoot.or("Unknown"));
  CopyEventSubmitterHelper.submitSuccessfulDatasetPublish(this.eventSubmitter, datasetAndPartition, Long.toString(datasetOriginTimestamp), Long.toString(datasetUpstreamTimestamp), additionalMetadata);
}
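In short, the publish sequence above is: locate the per-dataset staging directory under the writer output directory, run the pre-publish commit steps, recursively rename staged files to their absolute targets, run the post-publish steps, and delete the staging directory. A condensed sketch of that control flow, using only the fields and helpers that already appear in the method (this.fs, this.writerOutputDir, getCommitSequence, executeCommitSequence, hasCopyableFiles):

// Condensed view of the publish steps above (a sketch, not a drop-in replacement for the method).
Path staging = new Path(this.writerOutputDir, datasetAndPartition.identifier());
executeCommitSequence(getCommitSequence(datasetWorkUnitStates, PrePublishStep.class));
if (hasCopyableFiles(datasetWorkUnitStates)) {
  // Copy targets are absolute, so the recursive rename starts at the filesystem root.
  HadoopUtils.renameRecursively(this.fs, staging, new Path("/"));
}
executeCommitSequence(getCommitSequence(datasetWorkUnitStates, PostPublishStep.class));
this.fs.delete(staging, true);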