Search in sources :

Example 11 with CopyEntity

Use of org.apache.gobblin.data.management.copy.CopyEntity in the incubator-gobblin project by Apache.

From the class CopyDataPublisher, method publishFileSet.

/**
 * Publish data for a {@link CopyableDataset}.
 *
 * <p>Runs any pre-publish commit steps, moves the staged files for this fileset from the writer
 * output directory to their final (absolute) destinations, runs post-publish commit steps, marks
 * the successful work units as committed, and emits SLA/lineage events describing the publish.
 *
 * @param datasetAndPartition identifies the dataset/partition whose fileset is being published
 * @param datasetWorkUnitStates work unit states belonging to this fileset; must be non-empty
 * @throws IOException if moving or deleting files on the filesystem fails, or a commit step fails
 */
private void publishFileSet(CopyEntity.DatasetAndPartition datasetAndPartition, Collection<WorkUnitState> datasetWorkUnitStates) throws IOException {
    Map<String, String> additionalMetadata = Maps.newHashMap();
    // Fixed message grammar: "empty collection work units" -> "empty collection of work units".
    Preconditions.checkArgument(!datasetWorkUnitStates.isEmpty(), "publishFileSet received an empty collection of work units. This is an error in code.");
    // All work units in a fileset share the same serialized dataset metadata, so reading it from
    // the first one is sufficient.
    CopyableDatasetMetadata metadata = CopyableDatasetMetadata.deserialize(datasetWorkUnitStates.iterator().next().getProp(CopySource.SERIALIZED_COPYABLE_DATASET));
    Path datasetWriterOutputPath = new Path(this.writerOutputDir, datasetAndPartition.identifier());
    log.info(String.format("[%s] Publishing fileSet from %s for dataset %s", datasetAndPartition.identifier(), datasetWriterOutputPath, metadata.getDatasetURN()));
    List<CommitStep> prePublish = getCommitSequence(datasetWorkUnitStates, PrePublishStep.class);
    List<CommitStep> postPublish = getCommitSequence(datasetWorkUnitStates, PostPublishStep.class);
    log.info(String.format("[%s] Found %d prePublish steps and %d postPublish steps.", datasetAndPartition.identifier(), prePublish.size(), postPublish.size()));
    executeCommitSequence(prePublish);
    if (hasCopyableFiles(datasetWorkUnitStates)) {
        // Targets are always absolute, so we start moving from root (will skip any existing directories).
        HadoopUtils.renameRecursively(this.fs, datasetWriterOutputPath, new Path("/"));
    } else {
        log.info(String.format("[%s] No copyable files in dataset. Proceeding to postpublish steps.", datasetAndPartition.identifier()));
    }
    executeCommitSequence(postPublish);
    // Staging directory is no longer needed once files are at their final locations.
    this.fs.delete(datasetWriterOutputPath, true);
    // Track the minimum origin/upstream timestamps across all copied files; MAX_VALUE is the
    // "nothing seen yet" sentinel, normalized to 0 below.
    long datasetOriginTimestamp = Long.MAX_VALUE;
    long datasetUpstreamTimestamp = Long.MAX_VALUE;
    Optional<String> fileSetRoot = Optional.<String>absent();
    for (WorkUnitState wus : datasetWorkUnitStates) {
        // Use the fully-qualified enum consistently (original mixed WorkingState and
        // WorkUnitState.WorkingState).
        if (wus.getWorkingState() == WorkUnitState.WorkingState.SUCCESSFUL) {
            wus.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
        }
        CopyEntity copyEntity = CopySource.deserializeCopyEntity(wus);
        if (copyEntity instanceof CopyableFile) {
            CopyableFile copyableFile = (CopyableFile) copyEntity;
            if (wus.getWorkingState() == WorkUnitState.WorkingState.COMMITTED) {
                CopyEventSubmitterHelper.submitSuccessfulFilePublish(this.eventSubmitter, copyableFile, wus);
                // Currently datasetOutputPath is only present for hive datasets.
                if (!fileSetRoot.isPresent() && copyableFile.getDatasetOutputPath() != null) {
                    fileSetRoot = Optional.of(copyableFile.getDatasetOutputPath());
                }
                if (lineageInfo.isPresent()) {
                    lineageInfo.get().putDestination(copyableFile.getDestinationDataset(), 0, wus);
                }
            }
            if (datasetOriginTimestamp > copyableFile.getOriginTimestamp()) {
                datasetOriginTimestamp = copyableFile.getOriginTimestamp();
            }
            if (datasetUpstreamTimestamp > copyableFile.getUpstreamTimestamp()) {
                datasetUpstreamTimestamp = copyableFile.getUpstreamTimestamp();
            }
        }
    }
    // No CopyableFile was seen: replace the MAX_VALUE sentinels with 0 so emitted events carry a
    // readable value.
    if (datasetOriginTimestamp == Long.MAX_VALUE) {
        datasetOriginTimestamp = 0;
    }
    if (datasetUpstreamTimestamp == Long.MAX_VALUE) {
        datasetUpstreamTimestamp = 0;
    }
    // NOTE(review): getProp may return null if these SLA keys are unset — Maps.newHashMap
    // tolerates null values, but downstream consumers of this metadata should be confirmed.
    additionalMetadata.put(SlaEventKeys.SOURCE_URI, this.state.getProp(SlaEventKeys.SOURCE_URI));
    additionalMetadata.put(SlaEventKeys.DESTINATION_URI, this.state.getProp(SlaEventKeys.DESTINATION_URI));
    additionalMetadata.put(SlaEventKeys.DATASET_OUTPUT_PATH, fileSetRoot.or("Unknown"));
    CopyEventSubmitterHelper.submitSuccessfulDatasetPublish(this.eventSubmitter, datasetAndPartition, Long.toString(datasetOriginTimestamp), Long.toString(datasetUpstreamTimestamp), additionalMetadata);
}
Also used : Path(org.apache.hadoop.fs.Path) CommitStep(org.apache.gobblin.commit.CommitStep) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) CommitStepCopyEntity(org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity) CopyableDatasetMetadata(org.apache.gobblin.data.management.copy.CopyableDatasetMetadata) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile)

Example 12 with CopyEntity

Use of org.apache.gobblin.data.management.copy.CopyEntity in the incubator-gobblin project by Apache.

From the class HivePartitionsDeregisterFileSet, method generateCopyEntities.

/**
 * Generates the {@link CopyEntity}s required to deregister each Hive partition in this file set.
 *
 * <p>Partitions whose deregister steps cannot be created are logged (with the failure cause) and
 * skipped; the remaining partitions are still processed.
 *
 * @return deregister {@link CopyEntity}s for every partition that was processed successfully
 * @throws IOException declared for interface compatibility; per-partition IO failures are caught
 *         and logged rather than propagated
 */
@Override
protected Collection<CopyEntity> generateCopyEntities() throws IOException {
    List<CopyEntity> deregisterCopyEntities = Lists.newArrayList();
    // Priority is threaded through the helper so steps across partitions get increasing values.
    int priority = 1;
    for (Partition partition : partitionsToDeregister) {
        try {
            priority = this.helper.addPartitionDeregisterSteps(deregisterCopyEntities, getName(), priority, this.helper.getTargetTable(), partition);
        } catch (IOException ioe) {
            // Pass the exception as the log cause — the original dropped it, losing the stack trace.
            log.error("Could not create work unit to deregister partition " + partition.getCompleteName(), ioe);
        }
    }
    return deregisterCopyEntities;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) IOException(java.io.IOException)

Aggregations

CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity)12 CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile)8 Path (org.apache.hadoop.fs.Path)6 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)4 PostPublishStep (org.apache.gobblin.data.management.copy.entities.PostPublishStep)4 PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep)4 DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep)4 CopyableDatasetMetadata (org.apache.gobblin.data.management.copy.CopyableDatasetMetadata)3 CommitStepCopyEntity (org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity)3 Test (org.testng.annotations.Test)3 IOException (java.io.IOException)2 State (org.apache.gobblin.configuration.State)2 MultiTimingEvent (org.apache.gobblin.metrics.event.MultiTimingEvent)2 Partition (org.apache.hadoop.hive.ql.metadata.Partition)2 Table (org.apache.hadoop.hive.ql.metadata.Table)2 Closer (com.google.common.io.Closer)1 SuppressFBWarnings (edu.umd.cs.findbugs.annotations.SuppressFBWarnings)1 File (java.io.File)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1