Search in sources :

Example 1 with StreamCopier

Use of org.apache.gobblin.util.io.StreamCopier in project incubator-gobblin by apache.

From the class FileAwareInputStreamDataWriter, method writeImpl.

/**
 * Writes the contents of the input stream to the staging path.
 *
 * <p>
 *   {@code writeAt} indicates the path where the contents of the input stream should be written. When this method
 *   is called, the path writeAt.getParent() will exist already, but the path writeAt will not exist. When this
 *   method returns, the path writeAt must exist. Any data written to any location other than writeAt or a
 *   descendant of writeAt will be ignored.
 * </p>
 *
 * @param inputStream {@link FSDataInputStream} whose contents should be written to staging path.
 * @param writeAt {@link Path} at which contents should be written.
 * @param copyableFile {@link org.apache.gobblin.data.management.copy.CopyEntity} that generated this copy operation.
 * @throws IOException if a file system operation fails, or if size checking is enabled and the number of bytes
 *         copied does not match the source file's length.
 */
protected void writeImpl(InputStream inputStream, Path writeAt, CopyableFile copyableFile) throws IOException {
    // Preserve the origin's replication / block size only when the copy configuration requests it;
    // otherwise use the destination file system's defaults for the target path.
    final short replication = copyableFile.getPreserve().preserve(PreserveAttributes.Option.REPLICATION) ? copyableFile.getOrigin().getReplication() : this.fs.getDefaultReplication(writeAt);
    final long blockSize = copyableFile.getPreserve().preserve(PreserveAttributes.Option.BLOCK_SIZE) ? copyableFile.getOrigin().getBlockSize() : this.fs.getDefaultBlockSize(writeAt);
    // A previously persisted file is only reusable if it was written with the same replication and block size
    // we would use now.
    Predicate<FileStatus> fileStatusAttributesFilter = new Predicate<FileStatus>() {

        @Override
        public boolean apply(FileStatus input) {
            return input.getReplication() == replication && input.getBlockSize() == blockSize;
        }
    };
    Optional<FileStatus> persistedFile = this.recoveryHelper.findPersistedFile(this.state, copyableFile, fileStatusAttributesFilter);
    if (persistedFile.isPresent()) {
        // Recover an earlier, already-written copy by renaming it into place instead of re-copying the bytes.
        log.info(String.format("Recovering persisted file %s to %s.", persistedFile.get().getPath(), writeAt));
        this.fs.rename(persistedFile.get().getPath(), writeAt);
        return;
    }
    // Copy empty directories
    if (copyableFile.getFileStatus().isDirectory()) {
        this.fs.mkdirs(writeAt);
        return;
    }
    OutputStream os = this.fs.create(writeAt, true, this.fs.getConf().getInt("io.file.buffer.size", 4096), replication, blockSize);
    try {
        if (encryptionConfig != null) {
            os = EncryptionFactory.buildStreamCryptoProvider(encryptionConfig).encodeOutputStream(os);
        }
        // Try to wrap the input in a shared throttler. If the broker cannot provide one, fall back to an
        // unthrottled copy. (Previously the catch wrapped the whole copy, so a NotConfiguredException caused
        // the copy to be skipped entirely while the method still returned successfully, leaving an
        // empty/partial destination file.)
        InputStream streamToCopy = inputStream;
        try {
            FileSystem defaultFS = FileSystem.get(new Configuration());
            StreamThrottler<GobblinScopeTypes> throttler = this.taskBroker.getSharedResource(new StreamThrottler.Factory<GobblinScopeTypes>(), new EmptyKey());
            streamToCopy = throttler.throttleInputStream().inputStream(inputStream).sourceURI(copyableFile.getOrigin().getPath().makeQualified(defaultFS.getUri(), defaultFS.getWorkingDirectory()).toUri()).targetURI(this.fs.makeQualified(writeAt).toUri()).build();
        } catch (NotConfiguredException nce) {
            log.warn("Broker error. Throttling is not available for the stream copier; copying unthrottled.", nce);
        }
        StreamCopier copier = new StreamCopier(streamToCopy, os).withBufferSize(this.bufferSize);
        log.info("File {}: Starting copy", copyableFile.getOrigin().getPath());
        if (isInstrumentationEnabled()) {
            copier.withCopySpeedMeter(this.copySpeedMeter);
        }
        long numBytes = copier.copy();
        long fileSize = copyableFile.getFileStatus().getLen();
        if (this.checkFileSize && numBytes != fileSize) {
            throw new IOException(String.format("Number of bytes copied doesn't match filesize for file %s.", copyableFile.getOrigin().getPath()));
        }
        this.bytesWritten.addAndGet(numBytes);
        if (isInstrumentationEnabled()) {
            log.info("File {}: copied {} bytes, average rate: {} B/s", copyableFile.getOrigin().getPath(), this.copySpeedMeter.getCount(), this.copySpeedMeter.getMeanRate());
        } else {
            log.info("File {} copied.", copyableFile.getOrigin().getPath());
        }
    } finally {
        // Close the output first so its data is flushed, but guarantee the input stream is closed even if
        // closing the output throws (the original finally leaked inputStream in that case).
        try {
            os.close();
        } finally {
            inputStream.close();
        }
    }
}
Also used : NotConfiguredException(org.apache.gobblin.broker.iface.NotConfiguredException) FileStatus(org.apache.hadoop.fs.FileStatus) EmptyKey(org.apache.gobblin.broker.EmptyKey) Configuration(org.apache.hadoop.conf.Configuration) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) OutputStream(java.io.OutputStream) IOException(java.io.IOException) Predicate(com.google.common.base.Predicate) GobblinScopeTypes(org.apache.gobblin.broker.gobblin_scopes.GobblinScopeTypes) FileSystem(org.apache.hadoop.fs.FileSystem) ThrottledInputStream(org.apache.gobblin.util.io.ThrottledInputStream) StreamThrottler(org.apache.gobblin.util.io.StreamThrottler) StreamCopier(org.apache.gobblin.util.io.StreamCopier)

Example 2 with StreamCopier

Use of org.apache.gobblin.util.io.StreamCopier in project incubator-gobblin by apache.

From the class TarArchiveInputStreamDataWriter, method writeImpl.

/**
 * Untars the passed in {@link FileAwareInputStream} to the task's staging directory. Uses the name of the root
 * {@link TarArchiveEntry} in the stream as the directory name for the untarred file. The method also commits the data
 * by moving the file from staging to output directory.
 *
 * @param inputStream tar-formatted stream to unpack.
 * @param writeAt staging {@link Path}; entries are written under writeAt.getParent() with the root entry renamed
 *        to writeAt.getName().
 * @param copyableFile copy entity that generated this operation; used for logging and error messages.
 * @throws IOException if the archive is empty, or a read/write to the file systems fails.
 *
 * @see org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriter#write(org.apache.gobblin.data.management.copy.FileAwareInputStream)
 */
@Override
public void writeImpl(InputStream inputStream, Path writeAt, CopyableFile copyableFile) throws IOException {
    this.closer.register(inputStream);
    TarArchiveInputStream tarIn = new TarArchiveInputStream(inputStream);
    // One channel over the tar stream is reused for every entry; TarArchiveInputStream positions the
    // underlying stream at the current entry's data after each getNextTarEntry() call.
    final ReadableByteChannel inputChannel = Channels.newChannel(tarIn);
    TarArchiveEntry tarEntry;
    try {
        // flush the first entry in the tar, which is just the root directory
        // (Moved inside try so tarIn/inputChannel are closed if this read fails; the original also
        // dereferenced the entry without a null check, throwing an NPE on an empty archive.)
        tarEntry = tarIn.getNextTarEntry();
        if (tarEntry == null) {
            throw new IOException(String.format("Empty tar archive for file %s.", copyableFile.getOrigin().getPath()));
        }
        String tarEntryRootName = StringUtils.remove(tarEntry.getName(), Path.SEPARATOR);
        log.info("Unarchiving at " + writeAt);
        while ((tarEntry = tarIn.getNextTarEntry()) != null) {
            // the API tarEntry.getName() is misleading, it is actually the path of the tarEntry in the tar file
            String newTarEntryPath = tarEntry.getName().replace(tarEntryRootName, writeAt.getName());
            Path tarEntryStagingPath = new Path(writeAt.getParent(), newTarEntryPath);
            if (tarEntry.isDirectory() && !this.fs.exists(tarEntryStagingPath)) {
                this.fs.mkdirs(tarEntryStagingPath);
            } else if (!tarEntry.isDirectory()) {
                FSDataOutputStream out = this.fs.create(tarEntryStagingPath, true);
                final WritableByteChannel outputChannel = Channels.newChannel(out);
                try {
                    StreamCopier copier = new StreamCopier(inputChannel, outputChannel);
                    if (isInstrumentationEnabled()) {
                        copier.withCopySpeedMeter(this.copySpeedMeter);
                    }
                    this.bytesWritten.addAndGet(copier.copy());
                    if (isInstrumentationEnabled()) {
                        log.info("File {}: copied {} bytes, average rate: {} B/s", copyableFile.getOrigin().getPath(), this.copySpeedMeter.getCount(), this.copySpeedMeter.getMeanRate());
                    } else {
                        log.info("File {} copied.", copyableFile.getOrigin().getPath());
                    }
                } finally {
                    out.close();
                    outputChannel.close();
                }
            }
        }
    } finally {
        tarIn.close();
        inputChannel.close();
        inputStream.close();
    }
}
Also used : TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) Path(org.apache.hadoop.fs.Path) ReadableByteChannel(java.nio.channels.ReadableByteChannel) WritableByteChannel(java.nio.channels.WritableByteChannel) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) StreamCopier(org.apache.gobblin.util.io.StreamCopier)

Aggregations

StreamCopier (org.apache.gobblin.util.io.StreamCopier)2 Predicate (com.google.common.base.Predicate)1 IOException (java.io.IOException)1 OutputStream (java.io.OutputStream)1 ReadableByteChannel (java.nio.channels.ReadableByteChannel)1 WritableByteChannel (java.nio.channels.WritableByteChannel)1 TarArchiveEntry (org.apache.commons.compress.archivers.tar.TarArchiveEntry)1 TarArchiveInputStream (org.apache.commons.compress.archivers.tar.TarArchiveInputStream)1 EmptyKey (org.apache.gobblin.broker.EmptyKey)1 GobblinScopeTypes (org.apache.gobblin.broker.gobblin_scopes.GobblinScopeTypes)1 NotConfiguredException (org.apache.gobblin.broker.iface.NotConfiguredException)1 CopyConfiguration (org.apache.gobblin.data.management.copy.CopyConfiguration)1 StreamThrottler (org.apache.gobblin.util.io.StreamThrottler)1 ThrottledInputStream (org.apache.gobblin.util.io.ThrottledInputStream)1 Configuration (org.apache.hadoop.conf.Configuration)1 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1