
Example 76 with InputStreamCallback

Use of org.apache.nifi.processor.io.InputStreamCallback in project nifi by apache.

From the class PutS3Object, method onTrigger:

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final long startNanos = System.nanoTime();
    final String bucket = context.getProperty(BUCKET).evaluateAttributeExpressions(flowFile).getValue();
    final String key = context.getProperty(KEY).evaluateAttributeExpressions(flowFile).getValue();
    final String cacheKey = getIdentifier() + "/" + bucket + "/" + key;
    final AmazonS3Client s3 = getClient();
    final FlowFile ff = flowFile;
    final Map<String, String> attributes = new HashMap<>();
    final String ffFilename = ff.getAttributes().get(CoreAttributes.FILENAME.key());
    attributes.put(S3_BUCKET_KEY, bucket);
    attributes.put(S3_OBJECT_KEY, key);
    final Long multipartThreshold = context.getProperty(MULTIPART_THRESHOLD).asDataSize(DataUnit.B).longValue();
    final Long multipartPartSize = context.getProperty(MULTIPART_PART_SIZE).asDataSize(DataUnit.B).longValue();
    final long now = System.currentTimeMillis();
    /*
     * If necessary, run age off for existing uploads in AWS S3 and local state
     */
    ageoffS3Uploads(context, s3, now);
    /*
     * Then upload the FlowFile content, using a single PutObject request or a
     * multipart upload depending on the configured multipart threshold.
     */
    try {
        session.read(flowFile, new InputStreamCallback() {

            @Override
            public void process(final InputStream rawIn) throws IOException {
                try (final InputStream in = new BufferedInputStream(rawIn)) {
                    final ObjectMetadata objectMetadata = new ObjectMetadata();
                    objectMetadata.setContentDisposition(ff.getAttribute(CoreAttributes.FILENAME.key()));
                    objectMetadata.setContentLength(ff.getSize());
                    final String contentType = context.getProperty(CONTENT_TYPE).evaluateAttributeExpressions(ff).getValue();
                    if (contentType != null) {
                        objectMetadata.setContentType(contentType);
                        attributes.put(S3_CONTENT_TYPE, contentType);
                    }
                    final String expirationRule = context.getProperty(EXPIRATION_RULE_ID).evaluateAttributeExpressions(ff).getValue();
                    if (expirationRule != null) {
                        objectMetadata.setExpirationTimeRuleId(expirationRule);
                    }
                    final Map<String, String> userMetadata = new HashMap<>();
                    for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
                        if (entry.getKey().isDynamic()) {
                            final String value = context.getProperty(entry.getKey()).evaluateAttributeExpressions(ff).getValue();
                            userMetadata.put(entry.getKey().getName(), value);
                        }
                    }
                    final String serverSideEncryption = context.getProperty(SERVER_SIDE_ENCRYPTION).getValue();
                    if (!serverSideEncryption.equals(NO_SERVER_SIDE_ENCRYPTION)) {
                        objectMetadata.setSSEAlgorithm(serverSideEncryption);
                        attributes.put(S3_SSE_ALGORITHM, serverSideEncryption);
                    }
                    if (!userMetadata.isEmpty()) {
                        objectMetadata.setUserMetadata(userMetadata);
                    }
                    if (ff.getSize() <= multipartThreshold) {
                        // ----------------------------------------
                        // single part upload
                        // ----------------------------------------
                        final PutObjectRequest request = new PutObjectRequest(bucket, key, in, objectMetadata);
                        request.setStorageClass(StorageClass.valueOf(context.getProperty(STORAGE_CLASS).getValue()));
                        final AccessControlList acl = createACL(context, ff);
                        if (acl != null) {
                            request.setAccessControlList(acl);
                        }
                        final CannedAccessControlList cannedAcl = createCannedACL(context, ff);
                        if (cannedAcl != null) {
                            request.withCannedAcl(cannedAcl);
                        }
                        try {
                            final PutObjectResult result = s3.putObject(request);
                            if (result.getVersionId() != null) {
                                attributes.put(S3_VERSION_ATTR_KEY, result.getVersionId());
                            }
                            if (result.getETag() != null) {
                                attributes.put(S3_ETAG_ATTR_KEY, result.getETag());
                            }
                            if (result.getExpirationTime() != null) {
                                attributes.put(S3_EXPIRATION_ATTR_KEY, result.getExpirationTime().toString());
                            }
                            if (result.getMetadata().getRawMetadata().keySet().contains(S3_STORAGECLASS_META_KEY)) {
                                attributes.put(S3_STORAGECLASS_ATTR_KEY, result.getMetadata().getRawMetadataValue(S3_STORAGECLASS_META_KEY).toString());
                            }
                            if (userMetadata.size() > 0) {
                                StringBuilder userMetaBldr = new StringBuilder();
                                for (String userKey : userMetadata.keySet()) {
                                    userMetaBldr.append(userKey).append("=").append(userMetadata.get(userKey));
                                }
                                attributes.put(S3_USERMETA_ATTR_KEY, userMetaBldr.toString());
                            }
                            attributes.put(S3_API_METHOD_ATTR_KEY, S3_API_METHOD_PUTOBJECT);
                        } catch (AmazonClientException e) {
                            getLogger().info("Failure completing upload flowfile={} bucket={} key={} reason={}", new Object[] { ffFilename, bucket, key, e.getMessage() });
                            throw (e);
                        }
                    } else {
                        // ----------------------------------------
                        // multipart upload
                        // ----------------------------------------
                        // load or create persistent state
                        // ------------------------------------------------------------
                        MultipartState currentState;
                        try {
                            currentState = getLocalStateIfInS3(s3, bucket, cacheKey);
                            if (currentState != null) {
                                if (currentState.getPartETags().size() > 0) {
                                    final PartETag lastETag = currentState.getPartETags().get(currentState.getPartETags().size() - 1);
                                    getLogger().info("Resuming upload for flowfile='{}' bucket='{}' key='{}' " + "uploadID='{}' filePosition='{}' partSize='{}' storageClass='{}' " + "contentLength='{}' partsLoaded={} lastPart={}/{}", new Object[] { ffFilename, bucket, key, currentState.getUploadId(), currentState.getFilePosition(), currentState.getPartSize(), currentState.getStorageClass().toString(), currentState.getContentLength(), currentState.getPartETags().size(), Integer.toString(lastETag.getPartNumber()), lastETag.getETag() });
                                } else {
                                    getLogger().info("Resuming upload for flowfile='{}' bucket='{}' key='{}' " + "uploadID='{}' filePosition='{}' partSize='{}' storageClass='{}' " + "contentLength='{}' no partsLoaded", new Object[] { ffFilename, bucket, key, currentState.getUploadId(), currentState.getFilePosition(), currentState.getPartSize(), currentState.getStorageClass().toString(), currentState.getContentLength() });
                                }
                            } else {
                                currentState = new MultipartState();
                                currentState.setPartSize(multipartPartSize);
                                currentState.setStorageClass(StorageClass.valueOf(context.getProperty(STORAGE_CLASS).getValue()));
                                currentState.setContentLength(ff.getSize());
                                persistLocalState(cacheKey, currentState);
                                getLogger().info("Starting new upload for flowfile='{}' bucket='{}' key='{}'", new Object[] { ffFilename, bucket, key });
                            }
                        } catch (IOException e) {
                            getLogger().error("IOException initiating cache state while processing flow files: " + e.getMessage());
                            throw (e);
                        }
                        // ------------------------------------------------------------
                        if (currentState.getUploadId().isEmpty()) {
                            final InitiateMultipartUploadRequest initiateRequest = new InitiateMultipartUploadRequest(bucket, key, objectMetadata);
                            initiateRequest.setStorageClass(currentState.getStorageClass());
                            final AccessControlList acl = createACL(context, ff);
                            if (acl != null) {
                                initiateRequest.setAccessControlList(acl);
                            }
                            final CannedAccessControlList cannedAcl = createCannedACL(context, ff);
                            if (cannedAcl != null) {
                                initiateRequest.withCannedACL(cannedAcl);
                            }
                            try {
                                final InitiateMultipartUploadResult initiateResult = s3.initiateMultipartUpload(initiateRequest);
                                currentState.setUploadId(initiateResult.getUploadId());
                                currentState.getPartETags().clear();
                                try {
                                    persistLocalState(cacheKey, currentState);
                                } catch (Exception e) {
                                    getLogger().info("Exception saving cache state while processing flow file: " + e.getMessage());
                                    throw (new ProcessException("Exception saving cache state", e));
                                }
                                getLogger().info("Success initiating upload flowfile={} available={} position={} " + "length={} bucket={} key={} uploadId={}", new Object[] { ffFilename, in.available(), currentState.getFilePosition(), currentState.getContentLength(), bucket, key, currentState.getUploadId() });
                                if (initiateResult.getUploadId() != null) {
                                    attributes.put(S3_UPLOAD_ID_ATTR_KEY, initiateResult.getUploadId());
                                }
                            } catch (AmazonClientException e) {
                                getLogger().info("Failure initiating upload flowfile={} bucket={} key={} reason={}", new Object[] { ffFilename, bucket, key, e.getMessage() });
                                throw (e);
                            }
                        } else {
                            if (currentState.getFilePosition() > 0) {
                                try {
                                    final long skipped = in.skip(currentState.getFilePosition());
                                    if (skipped != currentState.getFilePosition()) {
                                        getLogger().info("Failure skipping to resume upload flowfile={} " + "bucket={} key={} position={} skipped={}", new Object[] { ffFilename, bucket, key, currentState.getFilePosition(), skipped });
                                    }
                                } catch (Exception e) {
                                    getLogger().info("Failure skipping to resume upload flowfile={} bucket={} " + "key={} position={} reason={}", new Object[] { ffFilename, bucket, key, currentState.getFilePosition(), e.getMessage() });
                                    throw (new ProcessException(e));
                                }
                            }
                        }
                        // upload parts
                        // ------------------------------------------------------------
                        long thisPartSize;
                        for (int part = currentState.getPartETags().size() + 1; currentState.getFilePosition() < currentState.getContentLength(); part++) {
                            if (!PutS3Object.this.isScheduled()) {
                                throw new IOException(S3_PROCESS_UNSCHEDULED_MESSAGE + " flowfile=" + ffFilename + " part=" + part + " uploadId=" + currentState.getUploadId());
                            }
                            thisPartSize = Math.min(currentState.getPartSize(), (currentState.getContentLength() - currentState.getFilePosition()));
                            UploadPartRequest uploadRequest = new UploadPartRequest().withBucketName(bucket).withKey(key).withUploadId(currentState.getUploadId()).withInputStream(in).withPartNumber(part).withPartSize(thisPartSize);
                            try {
                                UploadPartResult uploadPartResult = s3.uploadPart(uploadRequest);
                                currentState.addPartETag(uploadPartResult.getPartETag());
                                currentState.setFilePosition(currentState.getFilePosition() + thisPartSize);
                                try {
                                    persistLocalState(cacheKey, currentState);
                                } catch (Exception e) {
                                    getLogger().info("Exception saving cache state processing flow file: " + e.getMessage());
                                }
                                getLogger().info("Success uploading part flowfile={} part={} available={} " + "etag={} uploadId={}", new Object[] { ffFilename, part, in.available(), uploadPartResult.getETag(), currentState.getUploadId() });
                            } catch (AmazonClientException e) {
                                getLogger().info("Failure uploading part flowfile={} part={} bucket={} key={} " + "reason={}", new Object[] { ffFilename, part, bucket, key, e.getMessage() });
                                throw (e);
                            }
                        }
                        // complete multipart upload
                        // ------------------------------------------------------------
                        CompleteMultipartUploadRequest completeRequest = new CompleteMultipartUploadRequest(bucket, key, currentState.getUploadId(), currentState.getPartETags());
                        try {
                            CompleteMultipartUploadResult completeResult = s3.completeMultipartUpload(completeRequest);
                            getLogger().info("Success completing upload flowfile={} etag={} uploadId={}", new Object[] { ffFilename, completeResult.getETag(), currentState.getUploadId() });
                            if (completeResult.getVersionId() != null) {
                                attributes.put(S3_VERSION_ATTR_KEY, completeResult.getVersionId());
                            }
                            if (completeResult.getETag() != null) {
                                attributes.put(S3_ETAG_ATTR_KEY, completeResult.getETag());
                            }
                            if (completeResult.getExpirationTime() != null) {
                                attributes.put(S3_EXPIRATION_ATTR_KEY, completeResult.getExpirationTime().toString());
                            }
                            if (currentState.getStorageClass() != null) {
                                attributes.put(S3_STORAGECLASS_ATTR_KEY, currentState.getStorageClass().toString());
                            }
                            if (userMetadata.size() > 0) {
                                StringBuilder userMetaBldr = new StringBuilder();
                                for (String userKey : userMetadata.keySet()) {
                                    userMetaBldr.append(userKey).append("=").append(userMetadata.get(userKey));
                                }
                                attributes.put(S3_USERMETA_ATTR_KEY, userMetaBldr.toString());
                            }
                            attributes.put(S3_API_METHOD_ATTR_KEY, S3_API_METHOD_MULTIPARTUPLOAD);
                        } catch (AmazonClientException e) {
                            getLogger().info("Failure completing upload flowfile={} bucket={} key={} reason={}", new Object[] { ffFilename, bucket, key, e.getMessage() });
                            throw (e);
                        }
                    }
                }
            }
        });
        if (!attributes.isEmpty()) {
            flowFile = session.putAllAttributes(flowFile, attributes);
        }
        session.transfer(flowFile, REL_SUCCESS);
        final String url = s3.getResourceUrl(bucket, key);
        final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos);
        session.getProvenanceReporter().send(flowFile, url, millis);
        getLogger().info("Successfully put {} to Amazon S3 in {} milliseconds", new Object[] { ff, millis });
        try {
            removeLocalState(cacheKey);
        } catch (IOException e) {
            getLogger().info("Error trying to delete key {} from cache: {}", new Object[] { cacheKey, e.getMessage() });
        }
    } catch (final ProcessException | AmazonClientException pe) {
        if (pe.getMessage().contains(S3_PROCESS_UNSCHEDULED_MESSAGE)) {
            getLogger().info(pe.getMessage());
            session.rollback();
        } else {
            getLogger().error("Failed to put {} to Amazon S3 due to {}", new Object[] { flowFile, pe });
            flowFile = session.penalize(flowFile);
            session.transfer(flowFile, REL_FAILURE);
        }
    }
}
Also used : CannedAccessControlList(com.amazonaws.services.s3.model.CannedAccessControlList) AccessControlList(com.amazonaws.services.s3.model.AccessControlList) InitiateMultipartUploadResult(com.amazonaws.services.s3.model.InitiateMultipartUploadResult) HashMap(java.util.HashMap) AmazonClientException(com.amazonaws.AmazonClientException) CompleteMultipartUploadResult(com.amazonaws.services.s3.model.CompleteMultipartUploadResult) PartETag(com.amazonaws.services.s3.model.PartETag) UploadPartResult(com.amazonaws.services.s3.model.UploadPartResult) Entry(java.util.Map.Entry) BufferedInputStream(java.io.BufferedInputStream) PutObjectRequest(com.amazonaws.services.s3.model.PutObjectRequest) FlowFile(org.apache.nifi.flowfile.FlowFile) PutObjectResult(com.amazonaws.services.s3.model.PutObjectResult) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) InitiateMultipartUploadRequest(com.amazonaws.services.s3.model.InitiateMultipartUploadRequest) UploadPartRequest(com.amazonaws.services.s3.model.UploadPartRequest) IOException(java.io.IOException) CannedAccessControlList(com.amazonaws.services.s3.model.CannedAccessControlList) AmazonClientException(com.amazonaws.AmazonClientException) ProcessException(org.apache.nifi.processor.exception.ProcessException) AmazonS3Exception(com.amazonaws.services.s3.model.AmazonS3Exception) IOException(java.io.IOException) AmazonS3Client(com.amazonaws.services.s3.AmazonS3Client) ProcessException(org.apache.nifi.processor.exception.ProcessException) AtomicLong(java.util.concurrent.atomic.AtomicLong) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) ObjectMetadata(com.amazonaws.services.s3.model.ObjectMetadata) Map(java.util.Map) HashMap(java.util.HashMap) CompleteMultipartUploadRequest(com.amazonaws.services.s3.model.CompleteMultipartUploadRequest)
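
For comparison, a minimal sketch of the single-part path above with the multipart, ACL and state-tracking logic stripped away. The helper class and its parameter list are hypothetical, not part of PutS3Object; only the ProcessSession.read/InputStreamCallback pattern and the AWS SDK calls mirror the example.

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.io.InputStreamCallback;

import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PutObjectRequest;

public class SinglePartS3UploadSketch {

    // Streams the FlowFile content into a single-part S3 PUT. Bucket, key and the
    // size-vs-threshold decision are assumed to be resolved by the caller, as
    // PutS3Object does with its property descriptors.
    public static void upload(final ProcessSession session, final FlowFile flowFile,
                              final AmazonS3 s3, final String bucket, final String key) {
        session.read(flowFile, new InputStreamCallback() {
            @Override
            public void process(final InputStream rawIn) throws IOException {
                try (final InputStream in = new BufferedInputStream(rawIn)) {
                    final ObjectMetadata metadata = new ObjectMetadata();
                    // S3 needs the content length up front when uploading from a stream.
                    metadata.setContentLength(flowFile.getSize());
                    s3.putObject(new PutObjectRequest(bucket, key, in, metadata));
                }
            }
        });
    }
}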

Example 77 with InputStreamCallback

Use of org.apache.nifi.processor.io.InputStreamCallback in project nifi by apache.

From the class PutHDFS, method onTrigger:

@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final FileSystem hdfs = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();
    if (configuration == null || hdfs == null || ugi == null) {
        getLogger().error("HDFS not configured properly");
        session.transfer(flowFile, REL_FAILURE);
        context.yield();
        return;
    }
    ugi.doAs(new PrivilegedAction<Object>() {

        @Override
        public Object run() {
            Path tempDotCopyFile = null;
            FlowFile putFlowFile = flowFile;
            try {
                final String dirValue = context.getProperty(DIRECTORY).evaluateAttributeExpressions(putFlowFile).getValue();
                final Path configuredRootDirPath = new Path(dirValue);
                final String conflictResponse = context.getProperty(CONFLICT_RESOLUTION).getValue();
                final Double blockSizeProp = context.getProperty(BLOCK_SIZE).asDataSize(DataUnit.B);
                final long blockSize = blockSizeProp != null ? blockSizeProp.longValue() : hdfs.getDefaultBlockSize(configuredRootDirPath);
                final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
                final int bufferSize = bufferSizeProp != null ? bufferSizeProp.intValue() : configuration.getInt(BUFFER_SIZE_KEY, BUFFER_SIZE_DEFAULT);
                final Integer replicationProp = context.getProperty(REPLICATION_FACTOR).asInteger();
                final short replication = replicationProp != null ? replicationProp.shortValue() : hdfs.getDefaultReplication(configuredRootDirPath);
                final CompressionCodec codec = getCompressionCodec(context, configuration);
                final String filename = codec != null ? putFlowFile.getAttribute(CoreAttributes.FILENAME.key()) + codec.getDefaultExtension() : putFlowFile.getAttribute(CoreAttributes.FILENAME.key());
                final Path tempCopyFile = new Path(configuredRootDirPath, "." + filename);
                final Path copyFile = new Path(configuredRootDirPath, filename);
                // Create destination directory if it does not exist
                try {
                    if (!hdfs.getFileStatus(configuredRootDirPath).isDirectory()) {
                        throw new IOException(configuredRootDirPath.toString() + " already exists and is not a directory");
                    }
                } catch (FileNotFoundException fe) {
                    if (!hdfs.mkdirs(configuredRootDirPath)) {
                        throw new IOException(configuredRootDirPath.toString() + " could not be created");
                    }
                    changeOwner(context, hdfs, configuredRootDirPath, flowFile);
                }
                final boolean destinationExists = hdfs.exists(copyFile);
                // If destination file already exists, resolve that based on processor configuration
                if (destinationExists) {
                    switch(conflictResponse) {
                        case REPLACE_RESOLUTION:
                            if (hdfs.delete(copyFile, false)) {
                                getLogger().info("deleted {} in order to replace with the contents of {}", new Object[] { copyFile, putFlowFile });
                            }
                            break;
                        case IGNORE_RESOLUTION:
                            session.transfer(putFlowFile, REL_SUCCESS);
                            getLogger().info("transferring {} to success because file with same name already exists", new Object[] { putFlowFile });
                            return null;
                        case FAIL_RESOLUTION:
                            session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                            getLogger().warn("penalizing {} and routing to failure because file with same name already exists", new Object[] { putFlowFile });
                            return null;
                        default:
                            break;
                    }
                }
                // Write FlowFile to temp file on HDFS
                final StopWatch stopWatch = new StopWatch(true);
                session.read(putFlowFile, new InputStreamCallback() {

                    @Override
                    public void process(InputStream in) throws IOException {
                        OutputStream fos = null;
                        Path createdFile = null;
                        try {
                            if (conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) && destinationExists) {
                                fos = hdfs.append(copyFile, bufferSize);
                            } else {
                                fos = hdfs.create(tempCopyFile, true, bufferSize, replication, blockSize);
                            }
                            if (codec != null) {
                                fos = codec.createOutputStream(fos);
                            }
                            createdFile = tempCopyFile;
                            BufferedInputStream bis = new BufferedInputStream(in);
                            StreamUtils.copy(bis, fos);
                            bis = null;
                            fos.flush();
                        } finally {
                            try {
                                if (fos != null) {
                                    fos.close();
                                }
                            } catch (RemoteException re) {
                                // when talking to remote HDFS clusters, we don't notice problems until fos.close()
                                if (createdFile != null) {
                                    try {
                                        hdfs.delete(createdFile, false);
                                    } catch (Throwable ignore) {
                                    }
                                }
                                throw re;
                            } catch (Throwable ignore) {
                            }
                            fos = null;
                        }
                    }
                });
                stopWatch.stop();
                final String dataRate = stopWatch.calculateDataRate(putFlowFile.getSize());
                final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
                tempDotCopyFile = tempCopyFile;
                if (!conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) || (conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) && !destinationExists)) {
                    boolean renamed = false;
                    for (int i = 0; i < 10; i++) {
                        // try to rename multiple times.
                        if (hdfs.rename(tempCopyFile, copyFile)) {
                            renamed = true;
                            // rename was successful
                            break;
                        }
                        // try waiting to let whatever might cause rename failure to resolve
                        Thread.sleep(200L);
                    }
                    if (!renamed) {
                        hdfs.delete(tempCopyFile, false);
                        throw new ProcessException("Copied file to HDFS but could not rename dot file " + tempCopyFile + " to its final filename");
                    }
                    changeOwner(context, hdfs, copyFile, flowFile);
                }
                getLogger().info("copied {} to HDFS at {} in {} milliseconds at a rate of {}", new Object[] { putFlowFile, copyFile, millis, dataRate });
                final String newFilename = copyFile.getName();
                final String hdfsPath = copyFile.getParent().toString();
                putFlowFile = session.putAttribute(putFlowFile, CoreAttributes.FILENAME.key(), newFilename);
                putFlowFile = session.putAttribute(putFlowFile, ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
                final Path qualifiedPath = copyFile.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
                session.getProvenanceReporter().send(putFlowFile, qualifiedPath.toString());
                session.transfer(putFlowFile, REL_SUCCESS);
            } catch (final Throwable t) {
                if (tempDotCopyFile != null) {
                    try {
                        hdfs.delete(tempDotCopyFile, false);
                    } catch (Exception e) {
                        getLogger().error("Unable to remove temporary file {} due to {}", new Object[] { tempDotCopyFile, e });
                    }
                }
                getLogger().error("Failed to write to HDFS due to {}", new Object[] { t });
                session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                context.yield();
            }
            return null;
        }
    });
}
Also used : Path(org.apache.hadoop.fs.Path) FlowFile(org.apache.nifi.flowfile.FlowFile) Configuration(org.apache.hadoop.conf.Configuration) BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) ProcessException(org.apache.nifi.processor.exception.ProcessException) IOException(java.io.IOException) RemoteException(org.apache.hadoop.ipc.RemoteException) FileNotFoundException(java.io.FileNotFoundException) StopWatch(org.apache.nifi.util.StopWatch) ProcessException(org.apache.nifi.processor.exception.ProcessException) BufferedInputStream(java.io.BufferedInputStream) FileSystem(org.apache.hadoop.fs.FileSystem) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) RemoteException(org.apache.hadoop.ipc.RemoteException) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation)
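
The essential write-and-rename sequence can be reduced to the sketch below, assuming the default block size, buffer size and replication and no compression codec. The class and method names are illustrative only; the conflict-resolution strategies and error handling are left to the full example above.

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.io.InputStreamCallback;
import org.apache.nifi.stream.io.StreamUtils;

public class HdfsWriteSketch {

    public static void write(final ProcessSession session, final FlowFile flowFile,
                             final FileSystem hdfs, final Path directory, final String filename) throws IOException {
        final Path tempFile = new Path(directory, "." + filename);
        final Path finalFile = new Path(directory, filename);

        // Stream the FlowFile content into a temporary "dot" file first.
        session.read(flowFile, new InputStreamCallback() {
            @Override
            public void process(final InputStream in) throws IOException {
                try (final OutputStream out = hdfs.create(tempFile, true)) {
                    StreamUtils.copy(in, out);
                }
            }
        });

        // Only expose the file under its final name once the copy has succeeded.
        if (!hdfs.rename(tempFile, finalFile)) {
            hdfs.delete(tempFile, false);
            throw new IOException("Could not rename " + tempFile + " to " + finalFile);
        }
    }
}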

Example 78 with InputStreamCallback

Use of org.apache.nifi.processor.io.InputStreamCallback in project nifi by apache.

From the class PutGCSObject, method onTrigger:

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final long startNanos = System.nanoTime();
    final String bucket = context.getProperty(BUCKET).evaluateAttributeExpressions(flowFile).getValue();
    final String key = context.getProperty(KEY).evaluateAttributeExpressions(flowFile).getValue();
    final boolean overwrite = context.getProperty(OVERWRITE).asBoolean();
    final FlowFile ff = flowFile;
    final String ffFilename = ff.getAttributes().get(CoreAttributes.FILENAME.key());
    final Map<String, String> attributes = new HashMap<>();
    try {
        final Storage storage = getCloudService();
        session.read(flowFile, new InputStreamCallback() {

            @Override
            public void process(InputStream rawIn) throws IOException {
                try (final InputStream in = new BufferedInputStream(rawIn)) {
                    final BlobId id = BlobId.of(bucket, key);
                    final BlobInfo.Builder blobInfoBuilder = BlobInfo.newBuilder(id);
                    final List<Storage.BlobWriteOption> blobWriteOptions = new ArrayList<>();
                    if (!overwrite) {
                        blobWriteOptions.add(Storage.BlobWriteOption.doesNotExist());
                    }
                    final String contentDispositionType = context.getProperty(CONTENT_DISPOSITION_TYPE).getValue();
                    if (contentDispositionType != null) {
                        blobInfoBuilder.setContentDisposition(contentDispositionType + "; filename=" + ffFilename);
                    }
                    final String contentType = context.getProperty(CONTENT_TYPE).evaluateAttributeExpressions(ff).getValue();
                    if (contentType != null) {
                        blobInfoBuilder.setContentType(contentType);
                    }
                    final String md5 = context.getProperty(MD5).evaluateAttributeExpressions(ff).getValue();
                    if (md5 != null) {
                        blobInfoBuilder.setMd5(md5);
                        blobWriteOptions.add(Storage.BlobWriteOption.md5Match());
                    }
                    final String crc32c = context.getProperty(CRC32C).evaluateAttributeExpressions(ff).getValue();
                    if (crc32c != null) {
                        blobInfoBuilder.setCrc32c(crc32c);
                        blobWriteOptions.add(Storage.BlobWriteOption.crc32cMatch());
                    }
                    final String acl = context.getProperty(ACL).getValue();
                    if (acl != null) {
                        blobWriteOptions.add(Storage.BlobWriteOption.predefinedAcl(Storage.PredefinedAcl.valueOf(acl)));
                    }
                    final String encryptionKey = context.getProperty(ENCRYPTION_KEY).evaluateAttributeExpressions(ff).getValue();
                    if (encryptionKey != null) {
                        blobWriteOptions.add(Storage.BlobWriteOption.encryptionKey(encryptionKey));
                    }
                    final HashMap<String, String> userMetadata = new HashMap<>();
                    for (final Map.Entry<PropertyDescriptor, String> entry : context.getProperties().entrySet()) {
                        if (entry.getKey().isDynamic()) {
                            final String value = context.getProperty(entry.getKey()).evaluateAttributeExpressions(ff).getValue();
                            userMetadata.put(entry.getKey().getName(), value);
                        }
                    }
                    if (!userMetadata.isEmpty()) {
                        blobInfoBuilder.setMetadata(userMetadata);
                    }
                    try {
                        final Blob blob = storage.create(blobInfoBuilder.build(), in, blobWriteOptions.toArray(new Storage.BlobWriteOption[blobWriteOptions.size()]));
                        // Create attributes
                        attributes.put(BUCKET_ATTR, blob.getBucket());
                        attributes.put(KEY_ATTR, blob.getName());
                        if (blob.getSize() != null) {
                            attributes.put(SIZE_ATTR, String.valueOf(blob.getSize()));
                        }
                        if (blob.getCacheControl() != null) {
                            attributes.put(CACHE_CONTROL_ATTR, blob.getCacheControl());
                        }
                        if (blob.getComponentCount() != null) {
                            attributes.put(COMPONENT_COUNT_ATTR, String.valueOf(blob.getComponentCount()));
                        }
                        if (blob.getContentDisposition() != null) {
                            attributes.put(CONTENT_DISPOSITION_ATTR, blob.getContentDisposition());
                            final Util.ParsedContentDisposition parsed = Util.parseContentDisposition(blob.getContentDisposition());
                            if (parsed != null) {
                                attributes.put(CoreAttributes.FILENAME.key(), parsed.getFileName());
                            }
                        }
                        if (blob.getContentEncoding() != null) {
                            attributes.put(CONTENT_ENCODING_ATTR, blob.getContentEncoding());
                        }
                        if (blob.getContentLanguage() != null) {
                            attributes.put(CONTENT_LANGUAGE_ATTR, blob.getContentLanguage());
                        }
                        if (blob.getContentType() != null) {
                            attributes.put(CoreAttributes.MIME_TYPE.key(), blob.getContentType());
                        }
                        if (blob.getCrc32c() != null) {
                            attributes.put(CRC32C_ATTR, blob.getCrc32c());
                        }
                        if (blob.getCustomerEncryption() != null) {
                            final BlobInfo.CustomerEncryption encryption = blob.getCustomerEncryption();
                            attributes.put(ENCRYPTION_ALGORITHM_ATTR, encryption.getEncryptionAlgorithm());
                            attributes.put(ENCRYPTION_SHA256_ATTR, encryption.getKeySha256());
                        }
                        if (blob.getEtag() != null) {
                            attributes.put(ETAG_ATTR, blob.getEtag());
                        }
                        if (blob.getGeneratedId() != null) {
                            attributes.put(GENERATED_ID_ATTR, blob.getGeneratedId());
                        }
                        if (blob.getGeneration() != null) {
                            attributes.put(GENERATION_ATTR, String.valueOf(blob.getGeneration()));
                        }
                        if (blob.getMd5() != null) {
                            attributes.put(MD5_ATTR, blob.getMd5());
                        }
                        if (blob.getMediaLink() != null) {
                            attributes.put(MEDIA_LINK_ATTR, blob.getMediaLink());
                        }
                        if (blob.getMetageneration() != null) {
                            attributes.put(METAGENERATION_ATTR, String.valueOf(blob.getMetageneration()));
                        }
                        if (blob.getOwner() != null) {
                            final Acl.Entity entity = blob.getOwner();
                            if (entity instanceof Acl.User) {
                                attributes.put(OWNER_ATTR, ((Acl.User) entity).getEmail());
                                attributes.put(OWNER_TYPE_ATTR, "user");
                            } else if (entity instanceof Acl.Group) {
                                attributes.put(OWNER_ATTR, ((Acl.Group) entity).getEmail());
                                attributes.put(OWNER_TYPE_ATTR, "group");
                            } else if (entity instanceof Acl.Domain) {
                                attributes.put(OWNER_ATTR, ((Acl.Domain) entity).getDomain());
                                attributes.put(OWNER_TYPE_ATTR, "domain");
                            } else if (entity instanceof Acl.Project) {
                                attributes.put(OWNER_ATTR, ((Acl.Project) entity).getProjectId());
                                attributes.put(OWNER_TYPE_ATTR, "project");
                            }
                        }
                        if (blob.getSelfLink() != null) {
                            attributes.put(URI_ATTR, blob.getSelfLink());
                        }
                        if (blob.getCreateTime() != null) {
                            attributes.put(CREATE_TIME_ATTR, String.valueOf(blob.getCreateTime()));
                        }
                        if (blob.getUpdateTime() != null) {
                            attributes.put(UPDATE_TIME_ATTR, String.valueOf(blob.getUpdateTime()));
                        }
                    } catch (StorageException e) {
                        getLogger().error("Failure completing upload flowfile={} bucket={} key={} reason={}", new Object[] { ffFilename, bucket, key, e.getMessage() }, e);
                        throw (e);
                    }
                }
            }
        });
        if (!attributes.isEmpty()) {
            flowFile = session.putAllAttributes(flowFile, attributes);
        }
        session.transfer(flowFile, REL_SUCCESS);
        final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos);
        final String url = "https://" + bucket + ".storage.googleapis.com/" + key;
        session.getProvenanceReporter().send(flowFile, url, millis);
        getLogger().info("Successfully put {} to Google Cloud Storage in {} milliseconds", new Object[] { ff, millis });
    } catch (final ProcessException | StorageException e) {
        getLogger().error("Failed to put {} to Google Cloud Storage due to {}", new Object[] { flowFile, e.getMessage() }, e);
        flowFile = session.penalize(flowFile);
        session.transfer(flowFile, REL_FAILURE);
    }
}
Also used : HashMap(java.util.HashMap) BufferedInputStream(java.io.BufferedInputStream) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) FlowFile(org.apache.nifi.flowfile.FlowFile) Blob(com.google.cloud.storage.Blob) BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) IOException(java.io.IOException) Acl(com.google.cloud.storage.Acl) ProcessException(org.apache.nifi.processor.exception.ProcessException) Storage(com.google.cloud.storage.Storage) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) BlobId(com.google.cloud.storage.BlobId) StorageException(com.google.cloud.storage.StorageException)
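
Stripped of the metadata, ACL and precondition handling, the GCS write path follows the same read-and-stream shape. A minimal sketch with hypothetical class and method names; Storage.create with an InputStream is the same call used in the example above.

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.io.InputStreamCallback;

import com.google.cloud.storage.BlobId;
import com.google.cloud.storage.BlobInfo;
import com.google.cloud.storage.Storage;

public class GcsUploadSketch {

    public static void upload(final ProcessSession session, final FlowFile flowFile,
                              final Storage storage, final String bucket, final String key) {
        session.read(flowFile, new InputStreamCallback() {
            @Override
            public void process(final InputStream rawIn) throws IOException {
                try (final InputStream in = new BufferedInputStream(rawIn)) {
                    final BlobInfo blobInfo = BlobInfo.newBuilder(BlobId.of(bucket, key)).build();
                    // Preconditions such as doesNotExist(), md5Match() or crc32cMatch()
                    // would be passed as additional Storage.BlobWriteOption arguments.
                    storage.create(blobInfo, in);
                }
            }
        });
    }
}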

Example 79 with InputStreamCallback

Use of org.apache.nifi.processor.io.InputStreamCallback in project nifi by apache.

From the class PutHBaseJSON, method createPut:

@Override
protected PutFlowFile createPut(final ProcessSession session, final ProcessContext context, final FlowFile flowFile) {
    final String tableName = context.getProperty(TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue();
    final String rowId = context.getProperty(ROW_ID).evaluateAttributeExpressions(flowFile).getValue();
    final String rowFieldName = context.getProperty(ROW_FIELD_NAME).evaluateAttributeExpressions(flowFile).getValue();
    final String columnFamily = context.getProperty(COLUMN_FAMILY).evaluateAttributeExpressions(flowFile).getValue();
    final String timestampValue = context.getProperty(TIMESTAMP).evaluateAttributeExpressions(flowFile).getValue();
    final boolean extractRowId = !StringUtils.isBlank(rowFieldName);
    final String complexFieldStrategy = context.getProperty(COMPLEX_FIELD_STRATEGY).getValue();
    final String fieldEncodingStrategy = context.getProperty(FIELD_ENCODING_STRATEGY).getValue();
    final String rowIdEncodingStrategy = context.getProperty(ROW_ID_ENCODING_STRATEGY).getValue();
    final Long timestamp;
    if (!StringUtils.isBlank(timestampValue)) {
        try {
            timestamp = Long.valueOf(timestampValue);
        } catch (Exception e) {
            getLogger().error("Invalid timestamp value: " + timestampValue, e);
            return null;
        }
    } else {
        timestamp = null;
    }
    // Parse the JSON document
    final ObjectMapper mapper = new ObjectMapper();
    final AtomicReference<JsonNode> rootNodeRef = new AtomicReference<>(null);
    try {
        session.read(flowFile, new InputStreamCallback() {

            @Override
            public void process(final InputStream in) throws IOException {
                try (final InputStream bufferedIn = new BufferedInputStream(in)) {
                    rootNodeRef.set(mapper.readTree(bufferedIn));
                }
            }
        });
    } catch (final ProcessException pe) {
        getLogger().error("Failed to parse {} as JSON due to {}; routing to failure", new Object[] { flowFile, pe.toString() }, pe);
        return null;
    }
    final JsonNode rootNode = rootNodeRef.get();
    if (rootNode.isArray()) {
        getLogger().error("Root node of JSON must be a single document, found array for {}; routing to failure", new Object[] { flowFile });
        return null;
    }
    final Collection<PutColumn> columns = new ArrayList<>();
    final AtomicReference<String> rowIdHolder = new AtomicReference<>(null);
    // convert each field/value to a column for the put, skip over nulls and arrays
    final Iterator<String> fieldNames = rootNode.fieldNames();
    while (fieldNames.hasNext()) {
        final String fieldName = fieldNames.next();
        final AtomicReference<byte[]> fieldValueHolder = new AtomicReference<>(null);
        final JsonNode fieldNode = rootNode.get(fieldName);
        if (fieldNode.isNull()) {
            getLogger().debug("Skipping {} because value was null", new Object[] { fieldName });
        } else if (fieldNode.isValueNode()) {
            // for a value node we need to determine if we are storing the bytes of a string, or the bytes of actual types
            if (STRING_ENCODING_VALUE.equals(fieldEncodingStrategy)) {
                final byte[] valueBytes = clientService.toBytes(fieldNode.asText());
                fieldValueHolder.set(valueBytes);
            } else {
                fieldValueHolder.set(extractJNodeValue(fieldNode));
            }
        } else {
            // for non-null, non-value nodes, determine what to do based on the handling strategy
            switch(complexFieldStrategy) {
                case FAIL_VALUE:
                    getLogger().error("Complex value found for {}; routing to failure", new Object[] { fieldName });
                    return null;
                case WARN_VALUE:
                    getLogger().warn("Complex value found for {}; skipping", new Object[] { fieldName });
                    break;
                case TEXT_VALUE:
                    // use toString() here because asText() is only guaranteed to be supported on value nodes
                    // some other types of nodes, like ArrayNode, provide toString implementations
                    fieldValueHolder.set(clientService.toBytes(fieldNode.toString()));
                    break;
                case IGNORE_VALUE:
                    // silently skip
                    break;
                default:
                    break;
            }
        }
        // if we have a field value, either capture it as the row id (when it matches the configured row field name)
        // or add a new column where the fieldName and fieldValue are the column qualifier and value
        if (fieldValueHolder.get() != null) {
            if (extractRowId && fieldName.equals(rowFieldName)) {
                rowIdHolder.set(fieldNode.asText());
            } else {
                final byte[] colFamBytes = columnFamily.getBytes(StandardCharsets.UTF_8);
                final byte[] colQualBytes = fieldName.getBytes(StandardCharsets.UTF_8);
                final byte[] colValBytes = fieldValueHolder.get();
                columns.add(new PutColumn(colFamBytes, colQualBytes, colValBytes, timestamp));
            }
        }
    }
    // if a row id field was expected but not found in the document, log an error message so the user
    // can see what the field names were and return null so the FlowFile gets routed to failure
    if (extractRowId && rowIdHolder.get() == null) {
        final String fieldNameStr = StringUtils.join(rootNode.fieldNames(), ",");
        getLogger().error("Row ID field named '{}' not found in field names '{}'; routing to failure", new Object[] { rowFieldName, fieldNameStr });
        return null;
    }
    final String putRowId = (extractRowId ? rowIdHolder.get() : rowId);
    byte[] rowKeyBytes = getRow(putRowId, rowIdEncodingStrategy);
    return new PutFlowFile(tableName, rowKeyBytes, columns, flowFile);
}
Also used : BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) PutColumn(org.apache.nifi.hbase.put.PutColumn) ArrayList(java.util.ArrayList) JsonNode(com.fasterxml.jackson.databind.JsonNode) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) ProcessException(org.apache.nifi.processor.exception.ProcessException) IOException(java.io.IOException) PutFlowFile(org.apache.nifi.hbase.put.PutFlowFile) ProcessException(org.apache.nifi.processor.exception.ProcessException) BufferedInputStream(java.io.BufferedInputStream) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper)
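
Because InputStreamCallback.process cannot return a value, the example hands the parsed document out through an AtomicReference. That pattern on its own looks like the sketch below; the helper class and method names are hypothetical.

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.io.InputStreamCallback;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class FlowFileJsonSketch {

    public static JsonNode parse(final ProcessSession session, final FlowFile flowFile) {
        final ObjectMapper mapper = new ObjectMapper();
        final AtomicReference<JsonNode> rootRef = new AtomicReference<>(null);
        session.read(flowFile, new InputStreamCallback() {
            @Override
            public void process(final InputStream in) throws IOException {
                try (final InputStream bufferedIn = new BufferedInputStream(in)) {
                    // Stash the parsed tree in the reference for use after read() returns.
                    rootRef.set(mapper.readTree(bufferedIn));
                }
            }
        });
        return rootRef.get();
    }
}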

Example 80 with InputStreamCallback

Use of org.apache.nifi.processor.io.InputStreamCallback in project nifi by apache.

From the class AbstractHiveQLProcessor, method getHiveQL:

/**
 * Determines the HiveQL statement that should be executed for the given FlowFile
 *
 * @param session  the session that can be used to access the given FlowFile
 * @param flowFile the FlowFile whose HiveQL statement should be executed
 * @param charset  the character set used to decode the FlowFile content
 * @return the HiveQL that is associated with the given FlowFile
 */
protected String getHiveQL(final ProcessSession session, final FlowFile flowFile, final Charset charset) {
    // Read the HiveQL from the FlowFile's content
    final byte[] buffer = new byte[(int) flowFile.getSize()];
    session.read(flowFile, new InputStreamCallback() {

        @Override
        public void process(final InputStream in) throws IOException {
            StreamUtils.fillBuffer(in, buffer);
        }
    });
    // Convert the raw bytes to a String using the provided character set.
    return new String(buffer, charset);
}
Also used : InputStream(java.io.InputStream) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) IOException(java.io.IOException)
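
Since InputStreamCallback declares a single process(InputStream) method, the same helper can be written with a lambda on Java 8+. A minimal sketch, not taken from the NiFi code base; the class and method names are illustrative.

import java.nio.charset.Charset;

import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.stream.io.StreamUtils;

public class FlowFileTextSketch {

    public static String readContent(final ProcessSession session, final FlowFile flowFile, final Charset charset) {
        // Size the buffer from the FlowFile and fill it in one pass through the content.
        final byte[] buffer = new byte[(int) flowFile.getSize()];
        session.read(flowFile, in -> StreamUtils.fillBuffer(in, buffer));
        return new String(buffer, charset);
    }
}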

Aggregations

IOException (java.io.IOException): 80
InputStream (java.io.InputStream): 80
InputStreamCallback (org.apache.nifi.processor.io.InputStreamCallback): 80
FlowFile (org.apache.nifi.flowfile.FlowFile): 62
ProcessException (org.apache.nifi.processor.exception.ProcessException): 35
ComponentLog (org.apache.nifi.logging.ComponentLog): 27
HashMap (java.util.HashMap): 25
AtomicReference (java.util.concurrent.atomic.AtomicReference): 23
OutputStream (java.io.OutputStream): 19
BufferedInputStream (java.io.BufferedInputStream): 18
ArrayList (java.util.ArrayList): 17
Map (java.util.Map): 17
OutputStreamCallback (org.apache.nifi.processor.io.OutputStreamCallback): 13
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 11
BufferedInputStream (org.apache.nifi.stream.io.BufferedInputStream): 10
StopWatch (org.apache.nifi.util.StopWatch): 10
HashSet (java.util.HashSet): 9
Charset (java.nio.charset.Charset): 8
FileInputStream (java.io.FileInputStream): 7
ProcessSession (org.apache.nifi.processor.ProcessSession): 7