
Example 6 with CopyableDatasetMetadata

Use of org.apache.gobblin.data.management.copy.CopyableDatasetMetadata in project incubator-gobblin by apache.

From the class CopyDataPublisher, method persistFailedFileSet:

private int persistFailedFileSet(Collection<? extends WorkUnitState> workUnitStates) throws IOException {
    int filesPersisted = 0;
    for (WorkUnitState wu : workUnitStates) {
        // Only successfully written files are worth persisting for recovery.
        if (wu.getWorkingState() == WorkingState.SUCCESSFUL) {
            CopyEntity entity = CopySource.deserializeCopyEntity(wu);
            if (entity instanceof CopyableFile) {
                CopyableFile file = (CopyableFile) entity;
                Path outputDir = FileAwareInputStreamDataWriter.getOutputDir(wu);
                // The dataset metadata carried in the work unit is needed to resolve
                // the dataset-and-partition-specific output path of the file.
                CopyableDatasetMetadata metadata = CopySource.deserializeCopyableDataset(wu);
                Path outputPath = FileAwareInputStreamDataWriter.getOutputFilePath(file, outputDir, file.getDatasetAndPartition(metadata));
                // Persist the file so a later run can recover it instead of copying it again.
                if (this.recoveryHelper.persistFile(wu, file, outputPath)) {
                    filesPersisted++;
                }
            }
        }
    }
    return filesPersisted;
}
Also used: Path(org.apache.hadoop.fs.Path) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) CommitStepCopyEntity(org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) CopyableDatasetMetadata(org.apache.gobblin.data.management.copy.CopyableDatasetMetadata) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile)
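
The metadata read back above is stored in each work unit as a serialized string under CopySource.SERIALIZED_COPYABLE_DATASET (the same property publishFileSet reads in Example 7). Below is a minimal round-trip sketch, assuming CopyableDatasetMetadata exposes a serialize() counterpart to the deserialize(String) shown in these examples, can be constructed directly from a dataset, and that TestCopyableDataset (from the test utilities listed under Aggregations) has a no-argument constructor; only deserialize(String), the property key, and getDatasetURN() are confirmed by the examples on this page.

import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.data.management.copy.CopySource;
import org.apache.gobblin.data.management.copy.CopyableDatasetMetadata;
import org.apache.gobblin.data.management.copy.TestCopyableDataset;

public class MetadataRoundTripSketch {
    public static void main(String[] args) {
        // Serialize dataset metadata into a work unit, the way CopySource stores it.
        // serialize() and the no-arg TestCopyableDataset constructor are assumptions.
        CopyableDatasetMetadata metadata = new CopyableDatasetMetadata(new TestCopyableDataset());
        WorkUnitState wus = new WorkUnitState();
        wus.setProp(CopySource.SERIALIZED_COPYABLE_DATASET, metadata.serialize());

        // Read it back exactly as publishFileSet does in Example 7.
        CopyableDatasetMetadata restored =
            CopyableDatasetMetadata.deserialize(wus.getProp(CopySource.SERIALIZED_COPYABLE_DATASET));
        System.out.println(restored.getDatasetURN());
    }
}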

Example 7 with CopyableDatasetMetadata

Use of org.apache.gobblin.data.management.copy.CopyableDatasetMetadata in project incubator-gobblin by apache.

From the class CopyDataPublisher, method publishFileSet:

/**
 * Publish data for a {@link CopyableDataset}.
 */
private void publishFileSet(CopyEntity.DatasetAndPartition datasetAndPartition, Collection<WorkUnitState> datasetWorkUnitStates) throws IOException {
    Map<String, String> additionalMetadata = Maps.newHashMap();
    Preconditions.checkArgument(!datasetWorkUnitStates.isEmpty(), "publishFileSet received an empty collection of work units. This is an error in code.");
    CopyableDatasetMetadata metadata = CopyableDatasetMetadata.deserialize(datasetWorkUnitStates.iterator().next().getProp(CopySource.SERIALIZED_COPYABLE_DATASET));
    Path datasetWriterOutputPath = new Path(this.writerOutputDir, datasetAndPartition.identifier());
    log.info(String.format("[%s] Publishing fileSet from %s for dataset %s", datasetAndPartition.identifier(), datasetWriterOutputPath, metadata.getDatasetURN()));
    List<CommitStep> prePublish = getCommitSequence(datasetWorkUnitStates, PrePublishStep.class);
    List<CommitStep> postPublish = getCommitSequence(datasetWorkUnitStates, PostPublishStep.class);
    log.info(String.format("[%s] Found %d prePublish steps and %d postPublish steps.", datasetAndPartition.identifier(), prePublish.size(), postPublish.size()));
    executeCommitSequence(prePublish);
    if (hasCopyableFiles(datasetWorkUnitStates)) {
        // Targets are always absolute, so we start moving from root (will skip any existing directories).
        HadoopUtils.renameRecursively(this.fs, datasetWriterOutputPath, new Path("/"));
    } else {
        log.info(String.format("[%s] No copyable files in dataset. Proceeding to postpublish steps.", datasetAndPartition.identifier()));
    }
    executeCommitSequence(postPublish);
    this.fs.delete(datasetWriterOutputPath, true);
    long datasetOriginTimestamp = Long.MAX_VALUE;
    long datasetUpstreamTimestamp = Long.MAX_VALUE;
    Optional<String> fileSetRoot = Optional.<String>absent();
    for (WorkUnitState wus : datasetWorkUnitStates) {
        if (wus.getWorkingState() == WorkingState.SUCCESSFUL) {
            wus.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
        }
        CopyEntity copyEntity = CopySource.deserializeCopyEntity(wus);
        if (copyEntity instanceof CopyableFile) {
            CopyableFile copyableFile = (CopyableFile) copyEntity;
            if (wus.getWorkingState() == WorkingState.COMMITTED) {
                CopyEventSubmitterHelper.submitSuccessfulFilePublish(this.eventSubmitter, copyableFile, wus);
                // Currently datasetOutputPath is only present for hive datasets.
                if (!fileSetRoot.isPresent() && copyableFile.getDatasetOutputPath() != null) {
                    fileSetRoot = Optional.of(copyableFile.getDatasetOutputPath());
                }
                if (lineageInfo.isPresent()) {
                    lineageInfo.get().putDestination(copyableFile.getDestinationDataset(), 0, wus);
                }
            }
            if (datasetOriginTimestamp > copyableFile.getOriginTimestamp()) {
                datasetOriginTimestamp = copyableFile.getOriginTimestamp();
            }
            if (datasetUpstreamTimestamp > copyableFile.getUpstreamTimestamp()) {
                datasetUpstreamTimestamp = copyableFile.getUpstreamTimestamp();
            }
        }
    }
    // If no copyable files were seen, the sentinels are still Long.MAX_VALUE; report 0 instead for readability.
    if (Long.MAX_VALUE == datasetOriginTimestamp) {
        datasetOriginTimestamp = 0;
    }
    if (Long.MAX_VALUE == datasetUpstreamTimestamp) {
        datasetUpstreamTimestamp = 0;
    }
    additionalMetadata.put(SlaEventKeys.SOURCE_URI, this.state.getProp(SlaEventKeys.SOURCE_URI));
    additionalMetadata.put(SlaEventKeys.DESTINATION_URI, this.state.getProp(SlaEventKeys.DESTINATION_URI));
    additionalMetadata.put(SlaEventKeys.DATASET_OUTPUT_PATH, fileSetRoot.or("Unknown"));
    CopyEventSubmitterHelper.submitSuccessfulDatasetPublish(this.eventSubmitter, datasetAndPartition, Long.toString(datasetOriginTimestamp), Long.toString(datasetUpstreamTimestamp), additionalMetadata);
}
Also used: Path(org.apache.hadoop.fs.Path) CommitStep(org.apache.gobblin.commit.CommitStep) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) CommitStepCopyEntity(org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity) CopyableDatasetMetadata(org.apache.gobblin.data.management.copy.CopyableDatasetMetadata) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile)
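
The rename-to-root in publishFileSet works because each file is staged under datasetWriterOutputPath using its full target path, so moving that subtree onto "/" lands every file at its absolute destination. Below is a minimal sketch of the path arithmetic, with the staging layout inferred from the datasetWriterOutputPath construction and the renameRecursively call above; stagedPath is an illustrative helper, not Gobblin API.

import org.apache.hadoop.fs.Path;

public class StagingPathSketch {

    // Illustrative helper: where a file with absolute target path `target` is staged
    // before publish. Targets are absolute, so the leading separator is stripped to
    // nest them under the staging directory.
    static Path stagedPath(Path datasetWriterOutputPath, Path target) {
        String relative = target.toUri().getPath().replaceFirst("^/+", "");
        return new Path(datasetWriterOutputPath, relative);
    }

    public static void main(String[] args) {
        Path writerOutputDir = new Path("/gobblin/task-output");
        // datasetAndPartition.identifier() names the per-dataset staging subdirectory.
        Path datasetWriterOutputPath = new Path(writerOutputDir, "myDataset_partition1");
        Path target = new Path("/data/tracking/events/part-0000.avro");

        // Prints /gobblin/task-output/myDataset_partition1/data/tracking/events/part-0000.avro;
        // HadoopUtils.renameRecursively(fs, datasetWriterOutputPath, new Path("/")) then
        // moves the subtree onto "/", landing the file at its target path.
        System.out.println(stagedPath(datasetWriterOutputPath, target));
    }
}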

Aggregations

WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 7
CopyableDatasetMetadata (org.apache.gobblin.data.management.copy.CopyableDatasetMetadata): 7
Path (org.apache.hadoop.fs.Path): 7
CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile): 5
TestCopyableDataset (org.apache.gobblin.data.management.copy.TestCopyableDataset): 5
Test (org.testng.annotations.Test): 5
FileInputStream (java.io.FileInputStream): 3
CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity): 3
FileAwareInputStream (org.apache.gobblin.data.management.copy.FileAwareInputStream): 3
OwnerAndPermission (org.apache.gobblin.data.management.copy.OwnerAndPermission): 3
FileStatus (org.apache.hadoop.fs.FileStatus): 3
FsPermission (org.apache.hadoop.fs.permission.FsPermission): 3
File (java.io.File): 2
CommitStepCopyEntity (org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity): 2
ByteArrayInputStream (java.io.ByteArrayInputStream): 1
Properties (java.util.Properties): 1
CommitStep (org.apache.gobblin.commit.CommitStep): 1
State (org.apache.gobblin.configuration.State): 1
WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState): 1
CopyConfiguration (org.apache.gobblin.data.management.copy.CopyConfiguration): 1