Example 16 with CopyableFile

Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.

The class ConfigBasedDataset, method getCopyableFiles:

@Override
public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration copyConfiguration) throws IOException {
    List<CopyEntity> copyableFiles = Lists.newArrayList();
    EndPoint copyFromRaw = copyRoute.getCopyFrom();
    EndPoint copyToRaw = copyRoute.getCopyTo();
    if (!(copyFromRaw instanceof HadoopFsEndPoint && copyToRaw instanceof HadoopFsEndPoint)) {
        log.warn("Currently only handle the Hadoop Fs EndPoint replication");
        return copyableFiles;
    }
    // For {@link HadoopFsEndPoint}s, set pathfilter and applyFilterToDirectories
    HadoopFsEndPoint copyFrom = (HadoopFsEndPoint) copyFromRaw;
    HadoopFsEndPoint copyTo = (HadoopFsEndPoint) copyToRaw;
    copyFrom.setPathFilter(pathFilter);
    copyFrom.setApplyFilterToDirectories(applyFilterToDirectories);
    copyTo.setPathFilter(pathFilter);
    copyTo.setApplyFilterToDirectories(applyFilterToDirectories);
    if (this.watermarkEnabled) {
        if ((!copyFromRaw.getWatermark().isPresent() && copyToRaw.getWatermark().isPresent())
                || (copyFromRaw.getWatermark().isPresent() && copyToRaw.getWatermark().isPresent()
                        && copyFromRaw.getWatermark().get().compareTo(copyToRaw.getWatermark().get()) <= 0)) {
            log.info("No need to copy as destination watermark >= source watermark with source watermark {}, for dataset with metadata {}",
                copyFromRaw.getWatermark().isPresent() ? copyFromRaw.getWatermark().get().toJson() : "N/A", this.rc.getMetaData());
            return copyableFiles;
        }
    }
    Configuration conf = HadoopUtils.newConfiguration();
    FileSystem copyFromFs = FileSystem.get(copyFrom.getFsURI(), conf);
    FileSystem copyToFs = FileSystem.get(copyTo.getFsURI(), conf);
    Collection<FileStatus> allFilesInSource = copyFrom.getFiles();
    Collection<FileStatus> allFilesInTarget = copyTo.getFiles();
    Set<FileStatus> copyFromFileStatuses = Sets.newHashSet(allFilesInSource);
    Map<Path, FileStatus> copyToFileMap = Maps.newHashMap();
    for (FileStatus f : allFilesInTarget) {
        copyToFileMap.put(PathUtils.getPathWithoutSchemeAndAuthority(f.getPath()), f);
    }
    Collection<Path> deletedPaths = Lists.newArrayList();
    boolean watermarkMetadataCopied = false;
    boolean deleteTargetIfNotExistOnSource = rc.isDeleteTargetIfNotExistOnSource();
    for (FileStatus originFileStatus : copyFromFileStatuses) {
        Path relative = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(originFileStatus.getPath()), PathUtils.getPathWithoutSchemeAndAuthority(copyFrom.getDatasetPath()));
        // construct the new path in the target file system
        Path newPath = new Path(copyTo.getDatasetPath(), relative);
        if (relative.toString().equals(ReplicaHadoopFsEndPoint.WATERMARK_FILE)) {
            watermarkMetadataCopied = true;
        }
        // skip copying a file when the target already holds a copy of the same length with a newer timestamp
        if (copyToFileMap.containsKey(newPath) && copyToFileMap.get(newPath).getLen() == originFileStatus.getLen()
                && copyToFileMap.get(newPath).getModificationTime() > originFileStatus.getModificationTime()) {
            log.debug("Copy from timestamp older than copy to timestamp, skipped copy {} for dataset with metadata {}",
                originFileStatus.getPath(), this.rc.getMetaData());
        } else {
            // need to remove those files in the target File System
            if (copyToFileMap.containsKey(newPath)) {
                deletedPaths.add(newPath);
            }
            CopyableFile copyableFile = CopyableFile
                .fromOriginAndDestination(copyFromFs, originFileStatus, copyToFs.makeQualified(newPath), copyConfiguration)
                .fileSet(PathUtils.getPathWithoutSchemeAndAuthority(copyTo.getDatasetPath()).toString())
                .build();
            copyableFile.setFsDatasets(copyFromFs, copyToFs);
            copyableFiles.add(copyableFile);
        }
        // clean up already checked paths
        copyToFileMap.remove(newPath);
    }
    // delete paths in the target directory that do NOT exist on the source
    if (deleteTargetIfNotExistOnSource) {
        deletedPaths.addAll(copyToFileMap.keySet());
    }
    // delete old files first
    if (!deletedPaths.isEmpty()) {
        DeleteFileCommitStep deleteCommitStep = DeleteFileCommitStep.fromPaths(copyToFs, deletedPaths, this.props);
        copyableFiles.add(new PrePublishStep(copyTo.getDatasetPath().toString(), Maps.<String, String>newHashMap(), deleteCommitStep, 0));
    }
    // generate the watermark file even if watermark checking is disabled, so that it becomes functional once desired
    if ((!watermarkMetadataCopied) && copyFrom.getWatermark().isPresent()) {
        copyableFiles.add(new PostPublishStep(copyTo.getDatasetPath().toString(), Maps.<String, String>newHashMap(),
            new WatermarkMetadataGenerationCommitStep(copyTo.getFsURI().toString(), copyTo.getDatasetPath(), copyFrom.getWatermark().get()), 1));
    }
    return copyableFiles;
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), Configuration (org.apache.hadoop.conf.Configuration), CopyConfiguration (org.apache.gobblin.data.management.copy.CopyConfiguration), CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity), PostPublishStep (org.apache.gobblin.data.management.copy.entities.PostPublishStep), DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep), FileSystem (org.apache.hadoop.fs.FileSystem), CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile), PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep)
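
The core of this example is the CopyableFile builder: given a source FileStatus and a fully qualified destination path, fromOriginAndDestination yields a CopyEntity the copy framework can schedule. Below is a minimal sketch of just that step, with the watermark and delete handling stripped away; the class name, the method name toCopyEntities, and the root-path parameters are hypothetical placeholders, while the builder and PathUtils calls mirror the example above.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.google.common.collect.Lists;

import org.apache.gobblin.data.management.copy.CopyConfiguration;
import org.apache.gobblin.data.management.copy.CopyEntity;
import org.apache.gobblin.data.management.copy.CopyableFile;
import org.apache.gobblin.util.PathUtils;

public class CopyableFileBuilderSketch {

    // Builds one CopyableFile per source file, mirroring the source layout
    // relative to sourceRoot under targetRoot (both hypothetical parameters).
    static List<CopyEntity> toCopyEntities(FileSystem sourceFs, FileSystem targetFs, Path sourceRoot, Path targetRoot,
            Iterable<FileStatus> sourceFiles, CopyConfiguration copyConfiguration) throws IOException {
        List<CopyEntity> entities = Lists.newArrayList();
        for (FileStatus origin : sourceFiles) {
            // Relativize against the source root, then re-anchor under the target root.
            Path relative = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(origin.getPath()),
                PathUtils.getPathWithoutSchemeAndAuthority(sourceRoot));
            Path destination = targetFs.makeQualified(new Path(targetRoot, relative));
            entities.add(CopyableFile.fromOriginAndDestination(sourceFs, origin, destination, copyConfiguration)
                .fileSet(targetRoot.toString()).build());
        }
        return entities;
    }
}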

Example 17 with CopyableFile

Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.

The class CopyDataPublisher, method persistFailedFileSet:

private int persistFailedFileSet(Collection<? extends WorkUnitState> workUnitStates) throws IOException {
    int filesPersisted = 0;
    for (WorkUnitState wu : workUnitStates) {
        if (wu.getWorkingState() == WorkingState.SUCCESSFUL) {
            CopyEntity entity = CopySource.deserializeCopyEntity(wu);
            if (entity instanceof CopyableFile) {
                CopyableFile file = (CopyableFile) entity;
                Path outputDir = FileAwareInputStreamDataWriter.getOutputDir(wu);
                CopyableDatasetMetadata metadata = CopySource.deserializeCopyableDataset(wu);
                Path outputPath = FileAwareInputStreamDataWriter.getOutputFilePath(file, outputDir, file.getDatasetAndPartition(metadata));
                if (this.recoveryHelper.persistFile(wu, file, outputPath)) {
                    filesPersisted++;
                }
            }
        }
    }
    return filesPersisted;
}
Also used: Path (org.apache.hadoop.fs.Path), CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity), CommitStepCopyEntity (org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), CopyableDatasetMetadata (org.apache.gobblin.data.management.copy.CopyableDatasetMetadata), CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile)
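
The recovery logic above hinges on one pattern: deserialize the CopyEntity from each work unit and narrow it to CopyableFile, so commit-step entities are silently skipped. Here is a compact sketch of just that filter, using only calls visible in the example; the class and the helper name successfulFiles are hypothetical.

import java.util.Collection;
import java.util.List;

import com.google.common.collect.Lists;

import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.configuration.WorkUnitState.WorkingState;
import org.apache.gobblin.data.management.copy.CopyEntity;
import org.apache.gobblin.data.management.copy.CopySource;
import org.apache.gobblin.data.management.copy.CopyableFile;

class SuccessfulFileFilter {

    // Keeps only the CopyableFiles whose work units finished successfully;
    // commit-step entities such as PrePublishStep fail the instanceof check.
    static List<CopyableFile> successfulFiles(Collection<? extends WorkUnitState> workUnitStates) {
        List<CopyableFile> files = Lists.newArrayList();
        for (WorkUnitState wu : workUnitStates) {
            if (wu.getWorkingState() == WorkingState.SUCCESSFUL) {
                CopyEntity entity = CopySource.deserializeCopyEntity(wu);
                if (entity instanceof CopyableFile) {
                    files.add((CopyableFile) entity);
                }
            }
        }
        return files;
    }
}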

Example 18 with CopyableFile

Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.

The class CopyDataPublisher, method publishFileSet:

/**
 * Publish data for a {@link CopyableDataset}.
 */
private void publishFileSet(CopyEntity.DatasetAndPartition datasetAndPartition, Collection<WorkUnitState> datasetWorkUnitStates) throws IOException {
    Map<String, String> additionalMetadata = Maps.newHashMap();
    Preconditions.checkArgument(!datasetWorkUnitStates.isEmpty(), "publishFileSet received an empty collection of work units. This is an error in code.");
    CopyableDatasetMetadata metadata = CopyableDatasetMetadata.deserialize(datasetWorkUnitStates.iterator().next().getProp(CopySource.SERIALIZED_COPYABLE_DATASET));
    Path datasetWriterOutputPath = new Path(this.writerOutputDir, datasetAndPartition.identifier());
    log.info(String.format("[%s] Publishing fileSet from %s for dataset %s", datasetAndPartition.identifier(), datasetWriterOutputPath, metadata.getDatasetURN()));
    List<CommitStep> prePublish = getCommitSequence(datasetWorkUnitStates, PrePublishStep.class);
    List<CommitStep> postPublish = getCommitSequence(datasetWorkUnitStates, PostPublishStep.class);
    log.info(String.format("[%s] Found %d prePublish steps and %d postPublish steps.", datasetAndPartition.identifier(), prePublish.size(), postPublish.size()));
    executeCommitSequence(prePublish);
    if (hasCopyableFiles(datasetWorkUnitStates)) {
        // Targets are always absolute, so we start moving from root (will skip any existing directories).
        HadoopUtils.renameRecursively(this.fs, datasetWriterOutputPath, new Path("/"));
    } else {
        log.info(String.format("[%s] No copyable files in dataset. Proceeding to postpublish steps.", datasetAndPartition.identifier()));
    }
    executeCommitSequence(postPublish);
    this.fs.delete(datasetWriterOutputPath, true);
    long datasetOriginTimestamp = Long.MAX_VALUE;
    long datasetUpstreamTimestamp = Long.MAX_VALUE;
    Optional<String> fileSetRoot = Optional.<String>absent();
    for (WorkUnitState wus : datasetWorkUnitStates) {
        if (wus.getWorkingState() == WorkingState.SUCCESSFUL) {
            wus.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
        }
        CopyEntity copyEntity = CopySource.deserializeCopyEntity(wus);
        if (copyEntity instanceof CopyableFile) {
            CopyableFile copyableFile = (CopyableFile) copyEntity;
            if (wus.getWorkingState() == WorkingState.COMMITTED) {
                CopyEventSubmitterHelper.submitSuccessfulFilePublish(this.eventSubmitter, copyableFile, wus);
                // Currently datasetOutputPath is only present for hive datasets.
                if (!fileSetRoot.isPresent() && copyableFile.getDatasetOutputPath() != null) {
                    fileSetRoot = Optional.of(copyableFile.getDatasetOutputPath());
                }
                if (lineageInfo.isPresent()) {
                    lineageInfo.get().putDestination(copyableFile.getDestinationDataset(), 0, wus);
                }
            }
            if (datasetOriginTimestamp > copyableFile.getOriginTimestamp()) {
                datasetOriginTimestamp = copyableFile.getOriginTimestamp();
            }
            if (datasetUpstreamTimestamp > copyableFile.getUpstreamTimestamp()) {
                datasetUpstreamTimestamp = copyableFile.getUpstreamTimestamp();
            }
        }
    }
    // if no CopyableFile was seen, the timestamps are still Long.MAX_VALUE; reset them to something more readable
    if (Long.MAX_VALUE == datasetOriginTimestamp) {
        datasetOriginTimestamp = 0;
    }
    if (Long.MAX_VALUE == datasetUpstreamTimestamp) {
        datasetUpstreamTimestamp = 0;
    }
    additionalMetadata.put(SlaEventKeys.SOURCE_URI, this.state.getProp(SlaEventKeys.SOURCE_URI));
    additionalMetadata.put(SlaEventKeys.DESTINATION_URI, this.state.getProp(SlaEventKeys.DESTINATION_URI));
    additionalMetadata.put(SlaEventKeys.DATASET_OUTPUT_PATH, fileSetRoot.or("Unknown"));
    CopyEventSubmitterHelper.submitSuccessfulDatasetPublish(this.eventSubmitter, datasetAndPartition, Long.toString(datasetOriginTimestamp), Long.toString(datasetUpstreamTimestamp), additionalMetadata);
}
Also used: Path (org.apache.hadoop.fs.Path), CommitStep (org.apache.gobblin.commit.CommitStep), CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity), CommitStepCopyEntity (org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity), CopyableDatasetMetadata (org.apache.gobblin.data.management.copy.CopyableDatasetMetadata), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile)
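
One detail worth isolating from this example: the dataset timestamps are folded with a running minimum seeded at Long.MAX_VALUE, and that sentinel is mapped back to 0 when no CopyableFile contributed. A sketch of just that aggregation follows; the class, the method name, and its input list are hypothetical, while getOriginTimestamp comes from the example above.

import java.util.Collection;

import org.apache.gobblin.data.management.copy.CopyableFile;

class DatasetTimestampSketch {

    // Computes the oldest origin timestamp across the dataset's files.
    // Long.MAX_VALUE doubles as the "no files seen" sentinel and becomes 0.
    static long oldestOriginTimestamp(Collection<CopyableFile> files) {
        long datasetOriginTimestamp = Long.MAX_VALUE;
        for (CopyableFile file : files) {
            datasetOriginTimestamp = Math.min(datasetOriginTimestamp, file.getOriginTimestamp());
        }
        return datasetOriginTimestamp == Long.MAX_VALUE ? 0 : datasetOriginTimestamp;
    }
}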

Example 19 with CopyableFile

Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.

The class TimestampBasedCopyableDataset, method getCopyableFiles:

@Override
public Collection<CopyableFile> getCopyableFiles(FileSystem targetFs, CopyConfiguration configuration) throws IOException {
    log.info(String.format("Getting copyable files at root path: %s", this.datasetRoot));
    List<TimestampedDatasetVersion> versions = Lists.newArrayList(this.datasetVersionFinder.findDatasetVersions(this));
    if (versions.isEmpty()) {
        log.warn("No dataset version can be found. Ignoring.");
        return Lists.newArrayList();
    }
    Collection<TimestampedDatasetVersion> copyableVersions = this.versionSelectionPolicy.listSelectedVersions(versions);
    ConcurrentLinkedQueue<CopyableFile> copyableFileList = new ConcurrentLinkedQueue<>();
    List<Future<?>> futures = Lists.newArrayList();
    for (TimestampedDatasetVersion copyableVersion : copyableVersions) {
        futures.add(this.executor.submit(this.getCopyableFileGenetator(targetFs, configuration, copyableVersion, copyableFileList)));
    }
    try {
        for (Future<?> future : futures) {
            future.get();
        }
    } catch (ExecutionException | InterruptedException e) {
        throw new IOException("Failed to generate copyable files.", e);
    } finally {
        ExecutorsUtils.shutdownExecutorService(executor, Optional.of(log));
    }
    return copyableFileList;
}
Also used: TimestampedDatasetVersion (org.apache.gobblin.data.management.version.TimestampedDatasetVersion), IOException (java.io.IOException), CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile), Future (java.util.concurrent.Future), ConcurrentLinkedQueue (java.util.concurrent.ConcurrentLinkedQueue), ExecutionException (java.util.concurrent.ExecutionException)
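
The concurrency shape here is a fan-out: one task per dataset version appends into a shared ConcurrentLinkedQueue, then a barrier over the futures converts any task failure into an IOException. The sketch below shows the same skeleton with a generic mapping function; mapInParallel and the fixed thread count are hypothetical, whereas the example itself uses a preconfigured executor and Gobblin's ExecutorsUtils for shutdown.

import java.io.IOException;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.function.Function;

import com.google.common.collect.Lists;

class ParallelCollectSketch {

    // Fan-out: one task per input appends to a thread-safe queue.
    // Barrier: future.get() on every task rethrows failures as IOException.
    static <I, O> Queue<O> mapInParallel(List<I> inputs, Function<I, O> fn, int threads) throws IOException {
        ExecutorService executor = Executors.newFixedThreadPool(threads);
        ConcurrentLinkedQueue<O> results = new ConcurrentLinkedQueue<>();
        List<Future<?>> futures = Lists.newArrayList();
        for (I input : inputs) {
            futures.add(executor.submit(() -> {
                results.add(fn.apply(input));
            }));
        }
        try {
            for (Future<?> future : futures) {
                future.get();
            }
        } catch (ExecutionException | InterruptedException e) {
            throw new IOException("A parallel task failed.", e);
        } finally {
            executor.shutdown();
        }
        return results;
    }
}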

Example 20 with CopyableFile

Use of org.apache.gobblin.data.management.copy.CopyableFile in project incubator-gobblin by apache.

The class FileAwareInputStreamDataWriter, method writeImpl:

@Override
public final void writeImpl(FileAwareInputStream fileAwareInputStream) throws IOException {
    CopyableFile copyableFile = fileAwareInputStream.getFile();
    if (encryptionConfig != null) {
        copyableFile.setDestination(PathUtils.addExtension(copyableFile.getDestination(), "." + EncryptionConfigParser.getEncryptionType(encryptionConfig)));
    }
    Path stagingFile = getStagingFilePath(copyableFile);
    if (this.actualProcessedCopyableFile.isPresent()) {
        throw new IOException(this.getClass().getCanonicalName() + " can only process one file.");
    }
    this.actualProcessedCopyableFile = Optional.of(copyableFile);
    this.fs.mkdirs(stagingFile.getParent());
    writeImpl(fileAwareInputStream.getInputStream(), stagingFile, copyableFile);
    this.filesWritten.incrementAndGet();
}
Also used: Path (org.apache.hadoop.fs.Path), CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile), IOException (java.io.IOException)
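
Two small patterns carry this example: a Guava Optional used as a set-once guard so the writer never processes a second file, and creation of the staging file's parent directory before the actual byte copy. The guard is isolated into the sketch below; the class, field, and method names are hypothetical stand-ins for the writer's actualProcessedCopyableFile field.

import java.io.IOException;

import com.google.common.base.Optional;

class SingleUseGuardSketch {

    // Starts absent and may be filled exactly once; a second call fails fast.
    private Optional<String> processedItem = Optional.absent();

    void process(String item) throws IOException {
        if (this.processedItem.isPresent()) {
            throw new IOException(getClass().getCanonicalName() + " can only process one item.");
        }
        this.processedItem = Optional.of(item);
        // ... stage and write the item here, as writeImpl does with its staging path ...
    }
}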

Aggregations

CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile): 20 uses
Path (org.apache.hadoop.fs.Path): 15 uses
Test (org.testng.annotations.Test): 9 uses
CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity): 8 uses
FileStatus (org.apache.hadoop.fs.FileStatus): 7 uses
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 6 uses
CopyConfiguration (org.apache.gobblin.data.management.copy.CopyConfiguration): 6 uses
CopyableDatasetMetadata (org.apache.gobblin.data.management.copy.CopyableDatasetMetadata): 5 uses
OwnerAndPermission (org.apache.gobblin.data.management.copy.OwnerAndPermission): 5 uses
Configuration (org.apache.hadoop.conf.Configuration): 5 uses
IOException (java.io.IOException): 4 uses
Properties (java.util.Properties): 4 uses
FileAwareInputStream (org.apache.gobblin.data.management.copy.FileAwareInputStream): 4 uses
PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep): 4 uses
DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep): 4 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 4 uses
FsPermission (org.apache.hadoop.fs.permission.FsPermission): 4 uses
FileInputStream (java.io.FileInputStream): 3 uses
ConcurrentLinkedQueue (java.util.concurrent.ConcurrentLinkedQueue): 3 uses
TestCopyableDataset (org.apache.gobblin.data.management.copy.TestCopyableDataset): 3 uses