Use of org.apache.nifi.flowfile.FlowFile in project kylo by Teradata.
The class GetFeedsHistoryReindex, method onTrigger.
@Override
public void onTrigger(ProcessContext context, ProcessSession session) {
final ComponentLog logger = getLog();
FlowFile flowFile = session.get();
if (flowFile == null) {
flowFile = session.create();
}
logger.debug("Checking for feeds requiring reindexing historical data");
try {
MetadataProviderService metadataProviderService = getMetadataService(context);
if ((metadataProviderService != null) && (metadataProviderService.getProvider() != null)) {
String dateTimeOfCheck = String.valueOf(DateTime.now(DateTimeZone.UTC));
FeedsForDataHistoryReindex feedsForHistoryReindexing = metadataProviderService.getProvider().getFeedsForHistoryReindexing();
if (feedsForHistoryReindexing != null) {
logger.info("Found {} feeds requiring reindexing historical data", new Object[] { feedsForHistoryReindexing.getFeeds().size() });
if (feedsForHistoryReindexing.getFeedCount() > 0) {
for (Feed feedForHistoryReindexing : feedsForHistoryReindexing.getFeeds()) {
Map<String, String> attributes = new HashMap<>();
attributes.put(FEED_ID_FOR_HISTORY_REINDEX_KEY, feedForHistoryReindexing.getId());
attributes.put(FEED_SYSTEM_NAME_FOR_HISTORY_REINDEX_KEY, feedForHistoryReindexing.getSystemName());
attributes.put(FEED_CATEGORY_SYSTEM_NAME_FOR_HISTORY_REINDEX_KEY, feedForHistoryReindexing.getCategory().getSystemName());
attributes.put(FEED_STATUS_FOR_HISTORY_REINDEX_KEY, feedForHistoryReindexing.getCurrentHistoryReindexingStatus().getHistoryReindexingState().toString());
attributes.put(FEED_LAST_MODIFIED_UTC_FOR_HISTORY_REINDEX_KEY, feedForHistoryReindexing.getCurrentHistoryReindexingStatus().getLastModifiedTimestamp().toString());
attributes.put(FEEDS_TOTAL_COUNT_FOR_HISTORY_REINDEX_KEY, String.valueOf(feedsForHistoryReindexing.getFeedCount()));
attributes.put(FEEDS_TOTAL_IDS_FOR_HISTORY_REINDEX_KEY, feedsForHistoryReindexing.getFeedIds().toString());
attributes.put(FEEDS_CHECK_TIME_UTC_FOR_HISTORY_REINDEX_KEY, dateTimeOfCheck);
// all attributes from parent flow file copied except uuid, creates a FORK event
FlowFile feedFlowFile = session.create(flowFile);
feedFlowFile = session.putAllAttributes(feedFlowFile, attributes);
session.transfer(feedFlowFile, REL_FOUND);
logger.info("Flow file created for reindexing feed's historical data: feed id {}, category name {}, feed name {}", new Object[] { FEED_ID_FOR_HISTORY_REINDEX_KEY, FEED_CATEGORY_SYSTEM_NAME_FOR_HISTORY_REINDEX_KEY, FEED_SYSTEM_NAME_FOR_HISTORY_REINDEX_KEY });
}
flowFile = session.putAttribute(flowFile, FEEDS_TOTAL_COUNT_FOR_HISTORY_REINDEX_KEY, String.valueOf(feedsForHistoryReindexing.getFeedCount()));
flowFile = session.putAttribute(flowFile, FEEDS_CHECK_TIME_UTC_FOR_HISTORY_REINDEX_KEY, dateTimeOfCheck);
// only for found case
session.transfer(flowFile, REL_ORIGINAL);
} else {
// this will always be 0 here
flowFile = session.putAttribute(flowFile, FEEDS_TOTAL_COUNT_FOR_HISTORY_REINDEX_KEY, String.valueOf(feedsForHistoryReindexing.getFeedCount()));
// this will always be an empty list here
flowFile = session.putAttribute(flowFile, FEEDS_TOTAL_IDS_FOR_HISTORY_REINDEX_KEY, feedsForHistoryReindexing.getFeedIds().toString());
flowFile = session.putAttribute(flowFile, FEEDS_CHECK_TIME_UTC_FOR_HISTORY_REINDEX_KEY, dateTimeOfCheck);
session.transfer(flowFile, REL_NOT_FOUND);
}
}
} else {
logger.error("Error checking for feeds requiring reindexing historical data. Check if Kylo services is running, and accessible from NiFi.");
session.transfer(flowFile, REL_FAILURE);
}
} catch (Exception e) {
logger.error("An exception was thrown during check for feeds requiring reindexing historical data: {}", new Object[] { e });
session.transfer(flowFile, REL_FAILURE);
}
}
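The per-feed fan-out above relies on NiFi's parent/child flow file semantics: session.create(parentFlowFile) copies the parent's attributes (except uuid) onto the child and records a FORK provenance event, while the parent is routed on its own relationship. Below is a minimal, self-contained sketch of that pattern only, not the Kylo processor itself; the relationship names and item ids are illustrative assumptions.

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.AbstractProcessor;
import org.apache.nifi.processor.ProcessContext;
import org.apache.nifi.processor.ProcessSession;
import org.apache.nifi.processor.Relationship;

public class ForkPerItemSketch extends AbstractProcessor {

    static final Relationship REL_FOUND = new Relationship.Builder().name("found").build();
    static final Relationship REL_ORIGINAL = new Relationship.Builder().name("original").build();

    @Override
    public Set<Relationship> getRelationships() {
        return new HashSet<>(Arrays.asList(REL_FOUND, REL_ORIGINAL));
    }

    @Override
    public void onTrigger(ProcessContext context, ProcessSession session) {
        FlowFile parent = session.get();
        if (parent == null) {
            parent = session.create();
        }
        // One child per item: session.create(parent) copies the parent's attributes
        // (except uuid) and records a FORK provenance event against the parent.
        for (String itemId : new String[] { "feed-1", "feed-2" }) {   // hypothetical item ids
            Map<String, String> attributes = new HashMap<>();
            attributes.put("item.id", itemId);
            FlowFile child = session.create(parent);
            child = session.putAllAttributes(child, attributes);
            session.transfer(child, REL_FOUND);
        }
        // The parent is routed separately so downstream flows can audit the batch as a whole.
        session.transfer(parent, REL_ORIGINAL);
    }
}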
Use of org.apache.nifi.flowfile.FlowFile in project kylo by Teradata.
The class ExecutePySpark, method onTrigger.
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
final ComponentLog logger = getLog();
FlowFile flowFile = session.get();
if (flowFile == null) {
flowFile = session.create();
logger.info("Created a flow file having uuid: {}", new Object[] { flowFile.getAttribute(CoreAttributes.UUID.key()) });
} else {
logger.info("Using an existing flow file having uuid: {}", new Object[] { flowFile.getAttribute(CoreAttributes.UUID.key()) });
}
try {
final String kerberosPrincipal = context.getProperty(KERBEROS_PRINCIPAL).getValue();
final String kerberosKeyTab = context.getProperty(KERBEROS_KEYTAB).getValue();
final String hadoopConfigurationResources = context.getProperty(HADOOP_CONFIGURATION_RESOURCES).getValue();
final String pySparkAppFile = context.getProperty(PYSPARK_APP_FILE).evaluateAttributeExpressions(flowFile).getValue();
final String pySparkAppArgs = context.getProperty(PYSPARK_APP_ARGS).evaluateAttributeExpressions(flowFile).getValue();
final String pySparkAppName = context.getProperty(PYSPARK_APP_NAME).evaluateAttributeExpressions(flowFile).getValue();
final String pySparkAdditionalFiles = context.getProperty(PYSPARK_ADDITIONAL_FILES).evaluateAttributeExpressions(flowFile).getValue();
final String sparkMaster = context.getProperty(SPARK_MASTER).evaluateAttributeExpressions(flowFile).getValue().trim().toLowerCase();
final String sparkYarnDeployMode = context.getProperty(SPARK_YARN_DEPLOY_MODE).evaluateAttributeExpressions(flowFile).getValue();
final String yarnQueue = context.getProperty(YARN_QUEUE).evaluateAttributeExpressions(flowFile).getValue();
final String sparkHome = context.getProperty(SPARK_HOME).evaluateAttributeExpressions(flowFile).getValue();
final String driverMemory = context.getProperty(DRIVER_MEMORY).evaluateAttributeExpressions(flowFile).getValue();
final String executorMemory = context.getProperty(EXECUTOR_MEMORY).evaluateAttributeExpressions(flowFile).getValue();
final String executorInstances = context.getProperty(EXECUTOR_INSTANCES).evaluateAttributeExpressions(flowFile).getValue();
final String executorCores = context.getProperty(EXECUTOR_CORES).evaluateAttributeExpressions(flowFile).getValue();
final String networkTimeout = context.getProperty(NETWORK_TIMEOUT).evaluateAttributeExpressions(flowFile).getValue();
final String additionalSparkConfigOptions = context.getProperty(ADDITIONAL_SPARK_CONFIG_OPTIONS).evaluateAttributeExpressions(flowFile).getValue();
PySparkUtils pySparkUtils = new PySparkUtils();
/* Get app arguments */
String[] pySparkAppArgsArray = null;
if (!StringUtils.isEmpty(pySparkAppArgs)) {
pySparkAppArgsArray = pySparkUtils.getCsvValuesAsArray(pySparkAppArgs);
logger.info("Provided application arguments: {}", new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAppArgsArray) });
}
/* Get additional python files */
String[] pySparkAdditionalFilesArray = null;
if (!StringUtils.isEmpty(pySparkAdditionalFiles)) {
pySparkAdditionalFilesArray = pySparkUtils.getCsvValuesAsArray(pySparkAdditionalFiles);
logger.info("Provided python files: {}", new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAdditionalFilesArray) });
}
/* Get additional config key-value pairs */
String[] additionalSparkConfigOptionsArray = null;
if (!StringUtils.isEmpty(additionalSparkConfigOptions)) {
additionalSparkConfigOptionsArray = pySparkUtils.getCsvValuesAsArray(additionalSparkConfigOptions);
logger.info("Provided spark config options: {}", new Object[] { pySparkUtils.getCsvStringFromArray(additionalSparkConfigOptionsArray) });
}
/* Determine if Kerberos is enabled */
boolean kerberosEnabled = false;
if (!StringUtils.isEmpty(kerberosPrincipal) && !StringUtils.isEmpty(kerberosKeyTab) && !StringUtils.isEmpty(hadoopConfigurationResources)) {
kerberosEnabled = true;
logger.info("Kerberos is enabled");
}
/* For Kerberized cluster, attempt user authentication */
if (kerberosEnabled) {
logger.info("Attempting user authentication for Kerberos");
ApplySecurityPolicy applySecurityObject = new ApplySecurityPolicy();
Configuration configuration;
try {
logger.info("Getting Hadoop configuration from " + hadoopConfigurationResources);
configuration = ApplySecurityPolicy.getConfigurationFromResources(hadoopConfigurationResources);
if (SecurityUtil.isSecurityEnabled(configuration)) {
logger.info("Security is enabled");
if (kerberosPrincipal.equals("") && kerberosKeyTab.equals("")) {
logger.error("Kerberos Principal and Keytab provided with empty values for a Kerberized cluster.");
session.transfer(flowFile, REL_FAILURE);
return;
}
try {
logger.info("User authentication initiated");
boolean authenticationStatus = applySecurityObject.validateUserWithKerberos(logger, hadoopConfigurationResources, kerberosPrincipal, kerberosKeyTab);
if (authenticationStatus) {
logger.info("User authenticated successfully.");
} else {
logger.error("User authentication failed.");
session.transfer(flowFile, REL_FAILURE);
return;
}
} catch (Exception unknownException) {
logger.error("Unknown exception occurred while validating user :" + unknownException.getMessage());
session.transfer(flowFile, REL_FAILURE);
return;
}
}
} catch (IOException e1) {
logger.error("Unknown exception occurred while authenticating user :" + e1.getMessage());
session.transfer(flowFile, REL_FAILURE);
return;
}
}
/* Build and launch PySpark Job */
logger.info("Configuring PySpark job for execution");
SparkLauncher pySparkLauncher = new SparkLauncher().setAppResource(pySparkAppFile);
logger.info("PySpark app file set to: {}", new Object[] { pySparkAppFile });
if (pySparkAppArgsArray != null && pySparkAppArgsArray.length > 0) {
pySparkLauncher = pySparkLauncher.addAppArgs(pySparkAppArgsArray);
logger.info("App arguments set to: {}", new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAppArgsArray) });
}
pySparkLauncher = pySparkLauncher.setAppName(pySparkAppName).setMaster(sparkMaster);
logger.info("App name set to: {}", new Object[] { pySparkAppName });
logger.info("Spark master set to: {}", new Object[] { sparkMaster });
if (pySparkAdditionalFilesArray != null && pySparkAdditionalFilesArray.length > 0) {
for (String pySparkAdditionalFile : pySparkAdditionalFilesArray) {
pySparkLauncher = pySparkLauncher.addPyFile(pySparkAdditionalFile);
logger.info("Additional python file set to: {}", new Object[] { pySparkAdditionalFile });
}
}
if (sparkMaster.equals("yarn")) {
pySparkLauncher = pySparkLauncher.setDeployMode(sparkYarnDeployMode);
logger.info("YARN deploy mode set to: {}", new Object[] { sparkYarnDeployMode });
}
pySparkLauncher = pySparkLauncher.setSparkHome(sparkHome).setConf(SparkLauncher.DRIVER_MEMORY, driverMemory).setConf(SparkLauncher.EXECUTOR_MEMORY, executorMemory).setConf(CONFIG_PROP_SPARK_EXECUTOR_INSTANCES, executorInstances).setConf(SparkLauncher.EXECUTOR_CORES, executorCores).setConf(CONFIG_PROP_SPARK_NETWORK_TIMEOUT, networkTimeout);
logger.info("Spark home set to: {} ", new Object[] { sparkHome });
logger.info("Driver memory set to: {} ", new Object[] { driverMemory });
logger.info("Executor memory set to: {} ", new Object[] { executorMemory });
logger.info("Executor instances set to: {} ", new Object[] { executorInstances });
logger.info("Executor cores set to: {} ", new Object[] { executorCores });
logger.info("Network timeout set to: {} ", new Object[] { networkTimeout });
if (kerberosEnabled) {
pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_PRINCIPAL, kerberosPrincipal);
pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_KEYTAB, kerberosKeyTab);
logger.info("Kerberos principal set to: {} ", new Object[] { kerberosPrincipal });
logger.info("Kerberos keytab set to: {} ", new Object[] { kerberosKeyTab });
}
if (!StringUtils.isEmpty(yarnQueue)) {
pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_QUEUE, yarnQueue);
logger.info("YARN queue set to: {} ", new Object[] { yarnQueue });
}
if (additionalSparkConfigOptionsArray != null && additionalSparkConfigOptionsArray.length > 0) {
for (String additionalSparkConfigOption : additionalSparkConfigOptionsArray) {
String[] confKeyValue = additionalSparkConfigOption.split("=");
if (confKeyValue.length == 2) {
pySparkLauncher = pySparkLauncher.setConf(confKeyValue[0], confKeyValue[1]);
logger.info("Spark additional config option set to: {}={}", new Object[] { confKeyValue[0], confKeyValue[1] });
}
}
}
logger.info("Starting execution of PySpark job");
Process pySparkProcess = pySparkLauncher.launch();
InputStreamReaderRunnable inputStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO, logger, pySparkProcess.getInputStream());
Thread inputThread = new Thread(inputStreamReaderRunnable, "stream input");
inputThread.start();
InputStreamReaderRunnable errorStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO, logger, pySparkProcess.getErrorStream());
Thread errorThread = new Thread(errorStreamReaderRunnable, "stream error");
errorThread.start();
logger.info("Waiting for PySpark job to complete");
int exitCode = pySparkProcess.waitFor();
if (exitCode != 0) {
logger.info("Finished execution of PySpark job [FAILURE] [Status code: {}]", new Object[] { exitCode });
session.transfer(flowFile, REL_FAILURE);
} else {
logger.info("Finished execution of PySpark job [SUCCESS] [Status code: {}]", new Object[] { exitCode });
session.transfer(flowFile, REL_SUCCESS);
}
} catch (final Exception e) {
logger.error("Unable to execute PySpark job [FAILURE]", new Object[] { flowFile, e });
session.transfer(flowFile, REL_FAILURE);
}
}
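ExecutePySpark drives the job through Spark's SparkLauncher API: build the launcher, call launch() to get a plain java.lang.Process, drain stdout and stderr on background threads so the child process cannot block on a full pipe, then wait for the exit code. The following is a stripped-down, standalone sketch of that launch-and-wait sequence; the paths, master, and memory settings are placeholder assumptions, not values from the processor.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.spark.launcher.SparkLauncher;

public class PySparkLaunchSketch {

    public static void main(String[] args) throws Exception {
        Process spark = new SparkLauncher()
                .setSparkHome("/opt/spark")                      // assumed local Spark installation
                .setAppResource("/tmp/example_job.py")           // assumed PySpark script
                .setAppName("example-pyspark-job")
                .setMaster("yarn")
                .setDeployMode("cluster")
                .setConf(SparkLauncher.DRIVER_MEMORY, "1g")
                .setConf(SparkLauncher.EXECUTOR_MEMORY, "1g")
                .launch();

        // Drain both streams so the child process cannot stall on a full pipe buffer.
        drain(spark.getInputStream(), "spark-stdout");
        drain(spark.getErrorStream(), "spark-stderr");

        int exitCode = spark.waitFor();
        System.out.println("Spark job finished with exit code " + exitCode);
    }

    private static void drain(InputStream in, String name) {
        Thread t = new Thread(() -> {
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println("[" + name + "] " + line);
                }
            } catch (IOException ignored) {
                // the stream closes when the child process exits
            }
        }, name);
        t.setDaemon(true);
        t.start();
    }
}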
Use of org.apache.nifi.flowfile.FlowFile in project kylo by Teradata.
The class MetadataClientRecorder, method releaseWaterMark.
/* (non-Javadoc)
* @see com.thinkbiganalytics.nifi.core.api.metadata.MetadataRecorder#releaseWaterMark(org.apache.nifi.processor.ProcessSession, org.apache.nifi.flowfile.FlowFile, java.lang.String, java.lang.String)
*/
@Override
public FlowFile releaseWaterMark(ProcessSession session, FlowFile ff, String feedId, String waterMarkName) {
FlowFile resultFF = ff;
Map<String, WaterMarkParam> ffWaterMarks = getCurrentWaterMarksAttr(ff);
WaterMarkParam param = ffWaterMarks.get(waterMarkName);
try {
if (param != null) {
// Update the flowfile with the modified set of active water marks.
removeFromCurrentWaterMarksAttr(session, resultFF, waterMarkName, param.name);
resetWaterMarkParam(session, resultFF, feedId, waterMarkName, param.name);
} else {
log.warn("Received request to release a water mark not found in the flow file: {}", waterMarkName);
}
} finally {
// Even if water mark resetting fails we should always release the water mark.
Long activeTimestamp = getActiveWaterMarkTimestamp(feedId, waterMarkName);
if (activeTimestamp != null) {
if (param == null || param.timestamp == activeTimestamp) {
releaseActiveWaterMark(feedId, waterMarkName, activeTimestamp);
} else {
// If the water mark timestamp does not match the one recorded as an active water mark this means
// this flowfile's water mark has been canceled and another flow file should be considered the active one.
// In this case this water mark value has been superseded and no release should occur.
log.info("Received request to release a water mark version that is no longer active: {}", waterMarkName);
}
} else {
// The water mark is not recognized as an active one.
log.warn("Received request to release a non-active water mark: {}", waterMarkName);
}
}
return resultFF;
}
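The try/finally structure above guarantees the active water mark is released even when resetting the flow file fails, and the timestamp comparison ensures a superseded (cancelled) flow file cannot release a water mark it no longer owns. Here is a tiny standalone sketch of that compare-then-release idea, using an in-memory map in place of the Kylo metadata store; the class and method names are hypothetical.

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

public class WaterMarkReleaseSketch {

    // feedId:waterMarkName -> timestamp of the flow file currently holding the water mark
    private final ConcurrentMap<String, Long> activeWaterMarks = new ConcurrentHashMap<>();

    public void activate(String feedId, String waterMarkName, long timestamp) {
        activeWaterMarks.put(feedId + ":" + waterMarkName, timestamp);
    }

    /** Releases the water mark only if the caller still holds the active version. */
    public boolean release(String feedId, String waterMarkName, long callerTimestamp) {
        // remove(key, value) succeeds only when the stored timestamp equals the caller's,
        // so a superseded holder cannot release a newer, still-active water mark.
        return activeWaterMarks.remove(feedId + ":" + waterMarkName, callerTimestamp);
    }
}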
Use of org.apache.nifi.flowfile.FlowFile in project kylo by Teradata.
The class MetadataClientRecorder, method commitAllWaterMarks.
/* (non-Javadoc)
* @see com.thinkbiganalytics.nifi.core.api.metadata.MetadataRecorder#commitAllWaterMarks(org.apache.nifi.processor.ProcessSession, org.apache.nifi.flowfile.FlowFile, java.lang.String)
*/
@Override
public FlowFile commitAllWaterMarks(ProcessSession session, FlowFile ff, String feedId) {
FlowFile resultFF = ff;
Set<String> cancelledWaterMarks = new HashSet<>();
// TODO do more efficiently
for (String waterMarkName : new HashSet<String>(getCurrentWaterMarksAttr(ff).keySet())) {
try {
resultFF = commitWaterMark(session, resultFF, feedId, waterMarkName);
} catch (ActiveWaterMarksCancelledException e) {
cancelledWaterMarks.addAll(e.getWaterMarkNames());
}
}
if (cancelledWaterMarks.size() > 0) {
throw new ActiveWaterMarksCancelledException(feedId, cancelledWaterMarks);
} else {
return resultFF;
}
}
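commitAllWaterMarks applies a collect-then-throw pattern: every water mark is committed even if some are cancelled, the cancelled names are accumulated, and a single aggregated exception is thrown after the loop. A generic sketch of the same idea follows; the commit body and the exception type are placeholders.

import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class CommitAllSketch {

    /** Commits every water mark; one failure does not stop the remaining commits. */
    static void commitAll(List<String> waterMarkNames) {
        Set<String> cancelled = new HashSet<>();
        for (String name : waterMarkNames) {
            try {
                commit(name);            // may throw when the water mark was cancelled
            } catch (IllegalStateException e) {
                cancelled.add(name);     // remember the failure and keep going
            }
        }
        if (!cancelled.isEmpty()) {
            throw new IllegalStateException("Water marks cancelled: " + cancelled);
        }
    }

    private static void commit(String name) {
        // placeholder: the real recorder updates the Kylo metadata server here
    }
}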
Use of org.apache.nifi.flowfile.FlowFile in project kylo by Teradata.
The class SetSavepoint, method onTrigger.
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
final SavepointController controller = context.getProperty(SAVEPOINT_SERVICE).asControllerService(SavepointController.class);
final SavepointProvider provider = controller.getProvider();
final PropertyValue pvSavepointId = context.getProperty(SAVEPOINT_ID);
final String processorId = getIdentifier();
FlowFile flowFile = null;
long start = System.currentTimeMillis();
Optional<FlowFile> nextFlowfile = getNextFlowFile(context, session, controller, provider, pvSavepointId);
long stop = System.currentTimeMillis();
if (!nextFlowfile.isPresent()) {
return;
} else {
flowFile = nextFlowfile.get();
}
getLogger().info("Time to iterate over {} flow files: {} ms, {} ", new Object[] { session.getQueueSize(), (stop - start), nextFlowfile.isPresent() ? nextFlowfile.get() : " Nothing found " });
final ComponentLog logger = getLogger();
// We do processing on each flowfile here
final String savepointIdStr = pvSavepointId.evaluateAttributeExpressions(flowFile).getValue();
final String flowfileId = flowFile.getAttribute(CoreAttributes.UUID.key());
Lock lock = null;
try {
lock = provider.lock(savepointIdStr);
if (lock != null) {
SavepointEntry entry = provider.lookupEntry(savepointIdStr);
if (isExpired(context, session, provider, flowFile, savepointIdStr, lock)) {
return;
}
String waitStartTimestamp;
// add the processor id for the current savepoint
// this will be used to check on the next save point if the flow file should be examined and processed.
flowFile = session.putAttribute(flowFile, SAVEPOINT_PROCESSOR_ID, getIdentifier());
if (entry == null || entry.getState(processorId) == null) {
// Register new
provider.register(savepointIdStr, processorId, flowfileId, lock);
flowFile = tryFlowFile(session, flowFile, "-1");
// add in timestamps
// Set wait start timestamp if it's not set yet
waitStartTimestamp = flowFile.getAttribute(SAVEPOINT_START_TIMESTAMP);
if (waitStartTimestamp == null) {
waitStartTimestamp = String.valueOf(System.currentTimeMillis());
flowFile = session.putAttribute(flowFile, SAVEPOINT_START_TIMESTAMP, waitStartTimestamp);
}
session.transfer(flowFile);
} else {
SavepointEntry.SavePointState state = entry.getState(processorId);
switch(state) {
case RELEASE_SUCCESS:
provider.commitRelease(savepointIdStr, processorId, lock);
// add provenance to indicate success
flowFile = session.putAttribute(flowFile, SavepointProvenanceProperties.RELEASE_STATUS_KEY, SavepointProvenanceProperties.RELEASE_STATUS.SUCCESS.name());
session.transfer(flowFile, REL_RELEASE_SUCCESS);
break;
case RELEASE_FAILURE:
provider.commitRelease(savepointIdStr, processorId, lock);
// add provenance to indicate failure
flowFile = session.putAttribute(flowFile, SavepointProvenanceProperties.RELEASE_STATUS_KEY, SavepointProvenanceProperties.RELEASE_STATUS.FAILURE.name());
session.transfer(flowFile, REL_RELEASE_FAILURE);
break;
case RETRY:
String retryCount = flowFile.getAttribute(SAVEPOINT_RETRY_COUNT);
if (retryCount == null) {
retryCount = "0";
}
provider.commitRetry(savepointIdStr, processorId, lock);
flowFile = tryFlowFile(session, flowFile, retryCount);
session.transfer(flowFile);
break;
case WAIT:
session.transfer(flowFile, REL_SELF);
break;
default:
logger.warn("Unexpected savepoint state.");
session.transfer(flowFile, REL_FAILURE);
}
}
} else {
// Lock busy so try again later
// add it back to cache
controller.putFlowfileBack(processorId, flowfileId);
logger.info("Unable to obtain lock. It is already locked by another process. Adding back to queue {} ", new Object[] { flowfileId });
session.transfer(flowFile, REL_SELF);
}
} catch (IOException | InvalidLockException | InvalidSetpointException e) {
logger.warn("Failed to process flowfile {} for savepoint {}", new String[] { flowfileId, savepointIdStr }, e);
flowFile = session.putAttribute(flowFile, SAVEPOINT_EXCEPTION, "Failed to process flowfile " + flowfileId + " for savepoint " + savepointIdStr + ". " + e.getMessage());
session.transfer(flowFile, REL_FAILURE);
} finally {
if (lock != null) {
try {
provider.unlock(lock);
} catch (IOException e) {
logger.warn("Unable to unlock {}", new String[] { savepointIdStr });
}
}
}
}
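SetSavepoint wraps its state machine in a lock-per-savepoint protocol: acquire the lock, route the flow file according to the savepoint state, always unlock in the finally block, and when the lock is busy put the flow file back and retry later. A compact standalone sketch of that acquire/try/finally/requeue pattern, using java.util.concurrent locks instead of the SavepointProvider (names are illustrative):

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

public class SavepointLockSketch {

    private final ConcurrentMap<String, Lock> locks = new ConcurrentHashMap<>();

    /** Runs the work only if the savepoint lock is free; returns false so the caller can requeue. */
    public boolean process(String savepointId, Runnable work) {
        Lock lock = locks.computeIfAbsent(savepointId, id -> new ReentrantLock());
        if (!lock.tryLock()) {
            return false;              // busy: the caller puts the flow file back and tries later
        }
        try {
            work.run();                // examine the savepoint state and route the flow file
            return true;
        } finally {
            lock.unlock();             // always release, even when the work throws
        }
    }
}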