
Example 1 with CommitStep

Use of org.apache.gobblin.commit.CommitStep in project incubator-gobblin by apache.

From the class CopyDataPublisher, the method getCommitSequence:

private static List<CommitStep> getCommitSequence(Collection<WorkUnitState> workUnits, Class<?> baseClass) throws IOException {
    // Collect the serialized CommitStepCopyEntity instances whose type matches baseClass
    // (e.g. PrePublishStep or PostPublishStep).
    List<CommitStepCopyEntity> steps = Lists.newArrayList();
    for (WorkUnitState wus : workUnits) {
        if (baseClass.isAssignableFrom(CopySource.getCopyEntityClass(wus))) {
            CommitStepCopyEntity step = (CommitStepCopyEntity) CopySource.deserializeCopyEntity(wus);
            steps.add(step);
        }
    }
    // Sort the steps by ascending priority: lower values execute first.
    Comparator<CommitStepCopyEntity> commitStepSorter = new Comparator<CommitStepCopyEntity>() {

        @Override
        public int compare(CommitStepCopyEntity o1, CommitStepCopyEntity o2) {
            return Integer.compare(o1.getPriority(), o2.getPriority());
        }
    };
    Collections.sort(steps, commitStepSorter);
    // Extract the underlying CommitSteps in priority order.
    List<CommitStep> sequence = Lists.newArrayList();
    for (CommitStepCopyEntity entity : steps) {
        sequence.add(entity.getStep());
    }
    return sequence;
}
Also used: CommitStep(org.apache.gobblin.commit.CommitStep) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) CommitStepCopyEntity(org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity) Comparator(java.util.Comparator)
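
For reference, the CommitStep contract that all of these examples build on is a small interface: a step reports whether it has already completed and can execute itself. Below is a minimal sketch of a custom implementation; the two-method shape (isCompleted/execute, both throwing IOException) matches how the examples on this page use it, but the class itself (TouchMarkerCommitStep) is hypothetical, not part of Gobblin.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.apache.gobblin.commit.CommitStep;

/**
 * Hypothetical CommitStep that drops an empty marker file once a publish succeeds.
 * The path is stored as a String because steps get serialized into work units.
 */
public class TouchMarkerCommitStep implements CommitStep {

    private final String markerPath;

    public TouchMarkerCommitStep(String markerPath) {
        this.markerPath = markerPath;
    }

    @Override
    public boolean isCompleted() throws IOException {
        // Idempotency check: lets the framework skip a step that already ran.
        FileSystem fs = FileSystem.get(new Configuration());
        return fs.exists(new Path(this.markerPath));
    }

    @Override
    public void execute() throws IOException {
        // Create the empty marker file at the configured path.
        FileSystem fs = FileSystem.get(new Configuration());
        fs.create(new Path(this.markerPath)).close();
    }
}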

Example 2 with CommitStep

Use of org.apache.gobblin.commit.CommitStep in project incubator-gobblin by apache.

From the class RecursiveCopyableDatasetTest, the method classifyFiles:

private ClassifiedFiles classifyFiles(Collection<? extends CopyEntity> copyEntities) {
    Map<Path, Path> pathsToCopy = Maps.newHashMap();   // origin -> destination for files that would be copied
    Set<Path> pathsToDelete = Sets.newHashSet();       // target paths scheduled for deletion
    for (CopyEntity ce : copyEntities) {
        if (ce instanceof CopyableFile) {
            pathsToCopy.put(((CopyableFile) ce).getOrigin().getPath(), ((CopyableFile) ce).getDestination());
        }
        if (ce instanceof CommitStepCopyEntity) {
            CommitStep step = ((CommitStepCopyEntity) ce).getStep();
            if (step instanceof DeleteFileCommitStep) {
                for (FileStatus status : ((DeleteFileCommitStep) step).getPathsToDelete()) {
                    pathsToDelete.add(status.getPath());
                }
            }
        }
    }
    return new ClassifiedFiles(pathsToCopy, pathsToDelete);
}
Also used: Path(org.apache.hadoop.fs.Path) DeleteFileCommitStep(org.apache.gobblin.util.commit.DeleteFileCommitStep) CommitStep(org.apache.gobblin.commit.CommitStep) FileStatus(org.apache.hadoop.fs.FileStatus) CommitStepCopyEntity(org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity)
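
A hedged sketch of how this helper might be exercised in a test, written as a TestNG method. The dataset and configuration setup is elided, and the getPathsToCopy()/getPathsToDelete() accessors on ClassifiedFiles are assumed names, not confirmed by the snippet above.

@Test
public void testClassification() throws Exception {
    // dataset, targetFs and copyConfiguration would be built in the test's setup.
    Collection<? extends CopyEntity> copyEntities = dataset.getCopyableFiles(targetFs, copyConfiguration);
    ClassifiedFiles classified = classifyFiles(copyEntities);
    // Hypothetical paths: a changed source file should be scheduled for copy,
    // and a target-only file for deletion (when delete mode is enabled).
    Assert.assertTrue(classified.getPathsToCopy().containsKey(new Path("/src/part-00000")));
    Assert.assertTrue(classified.getPathsToDelete().contains(new Path("/dst/stale-file")));
}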

Example 3 with CommitStep

Use of org.apache.gobblin.commit.CommitStep in project incubator-gobblin by apache.

From the class CopyDataPublisher, the method publishFileSet:

/**
 * Publish data for a {@link CopyableDataset}.
 */
private void publishFileSet(CopyEntity.DatasetAndPartition datasetAndPartition, Collection<WorkUnitState> datasetWorkUnitStates) throws IOException {
    Map<String, String> additionalMetadata = Maps.newHashMap();
    Preconditions.checkArgument(!datasetWorkUnitStates.isEmpty(), "publishFileSet received an empty collection of work units. This is an error in the code.");
    CopyableDatasetMetadata metadata = CopyableDatasetMetadata.deserialize(datasetWorkUnitStates.iterator().next().getProp(CopySource.SERIALIZED_COPYABLE_DATASET));
    Path datasetWriterOutputPath = new Path(this.writerOutputDir, datasetAndPartition.identifier());
    log.info(String.format("[%s] Publishing fileSet from %s for dataset %s", datasetAndPartition.identifier(), datasetWriterOutputPath, metadata.getDatasetURN()));
    List<CommitStep> prePublish = getCommitSequence(datasetWorkUnitStates, PrePublishStep.class);
    List<CommitStep> postPublish = getCommitSequence(datasetWorkUnitStates, PostPublishStep.class);
    log.info(String.format("[%s] Found %d prePublish steps and %d postPublish steps.", datasetAndPartition.identifier(), prePublish.size(), postPublish.size()));
    executeCommitSequence(prePublish);
    if (hasCopyableFiles(datasetWorkUnitStates)) {
        // Targets are always absolute, so we start moving from root (will skip any existing directories).
        HadoopUtils.renameRecursively(this.fs, datasetWriterOutputPath, new Path("/"));
    } else {
        log.info(String.format("[%s] No copyable files in dataset. Proceeding to postpublish steps.", datasetAndPartition.identifier()));
    }
    executeCommitSequence(postPublish);
    this.fs.delete(datasetWriterOutputPath, true);
    long datasetOriginTimestamp = Long.MAX_VALUE;
    long datasetUpstreamTimestamp = Long.MAX_VALUE;
    Optional<String> fileSetRoot = Optional.<String>absent();
    for (WorkUnitState wus : datasetWorkUnitStates) {
        if (wus.getWorkingState() == WorkingState.SUCCESSFUL) {
            wus.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
        }
        CopyEntity copyEntity = CopySource.deserializeCopyEntity(wus);
        if (copyEntity instanceof CopyableFile) {
            CopyableFile copyableFile = (CopyableFile) copyEntity;
            if (wus.getWorkingState() == WorkingState.COMMITTED) {
                CopyEventSubmitterHelper.submitSuccessfulFilePublish(this.eventSubmitter, copyableFile, wus);
                // Currently datasetOutputPath is only present for hive datasets.
                if (!fileSetRoot.isPresent() && copyableFile.getDatasetOutputPath() != null) {
                    fileSetRoot = Optional.of(copyableFile.getDatasetOutputPath());
                }
                if (lineageInfo.isPresent()) {
                    lineageInfo.get().putDestination(copyableFile.getDestinationDataset(), 0, wus);
                }
            }
            if (datasetOriginTimestamp > copyableFile.getOriginTimestamp()) {
                datasetOriginTimestamp = copyableFile.getOriginTimestamp();
            }
            if (datasetUpstreamTimestamp > copyableFile.getUpstreamTimestamp()) {
                datasetUpstreamTimestamp = copyableFile.getUpstreamTimestamp();
            }
        }
    }
    // If no valid origin/upstream timestamps were found, replace Long.MAX_VALUE with something more readable.
    if (Long.MAX_VALUE == datasetOriginTimestamp) {
        datasetOriginTimestamp = 0;
    }
    if (Long.MAX_VALUE == datasetUpstreamTimestamp) {
        datasetUpstreamTimestamp = 0;
    }
    additionalMetadata.put(SlaEventKeys.SOURCE_URI, this.state.getProp(SlaEventKeys.SOURCE_URI));
    additionalMetadata.put(SlaEventKeys.DESTINATION_URI, this.state.getProp(SlaEventKeys.DESTINATION_URI));
    additionalMetadata.put(SlaEventKeys.DATASET_OUTPUT_PATH, fileSetRoot.or("Unknown"));
    CopyEventSubmitterHelper.submitSuccessfulDatasetPublish(this.eventSubmitter, datasetAndPartition, Long.toString(datasetOriginTimestamp), Long.toString(datasetUpstreamTimestamp), additionalMetadata);
}
Also used: Path(org.apache.hadoop.fs.Path) CommitStep(org.apache.gobblin.commit.CommitStep) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) CommitStepCopyEntity(org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity) CopyableDatasetMetadata(org.apache.gobblin.data.management.copy.CopyableDatasetMetadata) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile)
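
publishFileSet delegates to an executeCommitSequence helper that this page does not show. A plausible minimal sketch, assuming the helper simply runs the steps in order and skips any step that reports itself already completed:

private static void executeCommitSequence(List<CommitStep> steps) throws IOException {
    for (CommitStep step : steps) {
        // Skip steps that have already run (e.g. on a retried publish attempt).
        if (!step.isCompleted()) {
            step.execute();
        }
    }
}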

Example 4 with CommitStep

Use of org.apache.gobblin.commit.CommitStep in project incubator-gobblin by apache.

From the class RecursiveCopyableDataset, the method getCopyableFiles:

@Override
public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration configuration) throws IOException {
    Path nonGlobSearchPath = PathUtils.deepestNonGlobPath(this.glob);
    Path targetPath = new Path(configuration.getPublishDir(), PathUtils.relativizePath(this.rootPath, nonGlobSearchPath));
    Map<Path, FileStatus> filesInSource = createPathMap(getFilesAtPath(this.fs, this.rootPath, this.pathFilter), this.rootPath);
    Map<Path, FileStatus> filesInTarget = createPathMap(getFilesAtPath(targetFs, targetPath, this.pathFilter), targetPath);
    List<Path> toCopy = Lists.newArrayList();
    Map<Path, FileStatus> toDelete = Maps.newHashMap();
    boolean requiresUpdate = false;
    for (Map.Entry<Path, FileStatus> entry : filesInSource.entrySet()) {
        FileStatus statusInTarget = filesInTarget.remove(entry.getKey());
        if (statusInTarget != null) {
            // Present in both source and target: re-copy and replace only if the file has changed.
            if (!sameFile(filesInSource.get(entry.getKey()), statusInTarget)) {
                toCopy.add(entry.getKey());
                toDelete.put(entry.getKey(), statusInTarget);
                requiresUpdate = true;
            }
        } else {
            toCopy.add(entry.getKey());
        }
    }
    if (!this.update && requiresUpdate) {
        throw new IOException("Some files need to be copied but they already exist in the destination. " + "Aborting because not running in update mode.");
    }
    if (this.delete) {
        toDelete.putAll(filesInTarget);
    }
    List<CopyEntity> copyEntities = Lists.newArrayList();
    List<CopyableFile> copyableFiles = Lists.newArrayList();
    for (Path path : toCopy) {
        FileStatus file = filesInSource.get(path);
        Path filePathRelativeToSearchPath = PathUtils.relativizePath(file.getPath(), nonGlobSearchPath);
        Path thisTargetPath = new Path(configuration.getPublishDir(), filePathRelativeToSearchPath);
        CopyableFile copyableFile = CopyableFile.fromOriginAndDestination(this.fs, file, thisTargetPath, configuration)
                .fileSet(datasetURN())
                .datasetOutputPath(thisTargetPath.toString())
                .ancestorsOwnerAndPermission(CopyableFile.resolveReplicatedOwnerAndPermissionsRecursively(
                        this.fs, file.getPath().getParent(), nonGlobSearchPath, configuration))
                .build();
        copyableFile.setFsDatasets(this.fs, targetFs);
        copyableFiles.add(copyableFile);
    }
    copyEntities.addAll(this.copyableFileFilter.filter(this.fs, targetFs, copyableFiles));
    if (!toDelete.isEmpty()) {
        CommitStep step = new DeleteFileCommitStep(targetFs, toDelete.values(), this.properties, this.deleteEmptyDirectories ? Optional.of(targetPath) : Optional.<Path>absent());
        copyEntities.add(new PrePublishStep(datasetURN(), Maps.<String, String>newHashMap(), step, 1));
    }
    return copyEntities;
}
Also used: Path(org.apache.hadoop.fs.Path) DeleteFileCommitStep(org.apache.gobblin.util.commit.DeleteFileCommitStep) CommitStep(org.apache.gobblin.commit.CommitStep) FileStatus(org.apache.hadoop.fs.FileStatus) IOException(java.io.IOException) PrePublishStep(org.apache.gobblin.data.management.copy.entities.PrePublishStep) Map(java.util.Map)
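
The PrePublishStep above is registered with priority 1; as Example 1 shows, getCommitSequence sorts steps by ascending priority, so lower values run first within a phase. The fragment below sketches a companion post-publish entity; it assumes PostPublishStep mirrors the PrePublishStep constructor shown above and reuses the hypothetical TouchMarkerCommitStep from the sketch under Example 1.

// Hypothetical: drop a completion marker after the dataset is published. Priority 0
// would sort this ahead of any other post-publish steps in Example 1's comparator.
CommitStep marker = new TouchMarkerCommitStep(targetPath + "/_COPY_DONE");
copyEntities.add(new PostPublishStep(datasetURN(), Maps.<String, String>newHashMap(), marker, 0));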

Example 5 with CommitStep

Use of org.apache.gobblin.commit.CommitStep in project incubator-gobblin by apache.

From the class GobblinMultiTaskAttempt, the method commit:

/**
 * Commit {@link #tasks} by 1. calling {@link Task#commit()} in parallel; 2. executing any additional {@link CommitStep};
 * 3. persisting the task state store.
 * @throws IOException
 */
public void commit() throws IOException {
    if (this.tasks == null || this.tasks.isEmpty()) {
        log.warn("No tasks to be committed in container " + containerIdOptional.or(""));
        return;
    }
    Iterator<Callable<Void>> callableIterator = Iterators.transform(this.tasks.iterator(), new Function<Task, Callable<Void>>() {

        @Override
        public Callable<Void> apply(final Task task) {
            return new Callable<Void>() {

                @Nullable
                @Override
                public Void call() throws Exception {
                    task.commit();
                    return null;
                }
            };
        }
    });
    try {
        List<Either<Void, ExecutionException>> executionResults = new IteratorExecutor<>(callableIterator, this.getTaskCommitThreadPoolSize(), ExecutorsUtils.newDaemonThreadFactory(Optional.of(log), Optional.of("Task-committing-pool-%d"))).executeAndGetResults();
        IteratorExecutor.logFailures(executionResults, log, 10);
    } catch (InterruptedException ie) {
        log.error("Committing of tasks interrupted. Aborting.");
        throw new RuntimeException(ie);
    } finally {
        persistTaskStateStore();
        if (this.cleanupCommitSteps != null) {
            for (CommitStep cleanupCommitStep : this.cleanupCommitSteps) {
                log.info("Executing additional commit step.");
                cleanupCommitStep.execute();
            }
        }
    }
}
Also used: CommitStep(org.apache.gobblin.commit.CommitStep) Callable(java.util.concurrent.Callable) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) Either(org.apache.gobblin.util.Either) Nullable(javax.annotation.Nullable)
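
How steps end up in this.cleanupCommitSteps is not shown on this page, so the registration call in the sketch below is hypothetical; what the finally block above does guarantee is that every registered step is executed after the task state store is persisted, even when committing fails.

// A hedged sketch of a cleanup step like those executed in the finally block above.
// FileSystem, Path and Configuration are the Hadoop classes used elsewhere on this page.
CommitStep cleanupStagingDir = new CommitStep() {

    @Override
    public boolean isCompleted() {
        return false; // never report completed, so the cleanup always runs
    }

    @Override
    public void execute() throws IOException {
        FileSystem fs = FileSystem.get(new Configuration());
        fs.delete(new Path("/tmp/gobblin-task-staging"), true); // hypothetical staging dir
    }
};
multiTaskAttempt.addCleanupCommitStep(cleanupStagingDir); // hypothetical registration method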

Aggregations

CommitStep (org.apache.gobblin.commit.CommitStep) 5
CommitStepCopyEntity (org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity) 3
Path (org.apache.hadoop.fs.Path) 3
IOException (java.io.IOException) 2
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState) 2
DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep) 2
FileStatus (org.apache.hadoop.fs.FileStatus) 2
Comparator (java.util.Comparator) 1
Map (java.util.Map) 1
Callable (java.util.concurrent.Callable) 1
ExecutionException (java.util.concurrent.ExecutionException) 1
Nullable (javax.annotation.Nullable) 1
CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity) 1
CopyableDatasetMetadata (org.apache.gobblin.data.management.copy.CopyableDatasetMetadata) 1
CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile) 1
PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep) 1
Either (org.apache.gobblin.util.Either) 1