Use of org.apache.spark.launcher.SparkLauncher in project kylo by Teradata.
From the class ExecutePySpark, method onTrigger:
@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
final ComponentLog logger = getLog();
FlowFile flowFile = session.get();
if (flowFile == null) {
flowFile = session.create();
logger.info("Created a flow file having uuid: {}", new Object[] { flowFile.getAttribute(CoreAttributes.UUID.key()) });
} else {
logger.info("Using an existing flow file having uuid: {}", new Object[] { flowFile.getAttribute(CoreAttributes.UUID.key()) });
}
try {
final String kerberosPrincipal = context.getProperty(KERBEROS_PRINCIPAL).getValue();
final String kerberosKeyTab = context.getProperty(KERBEROS_KEYTAB).getValue();
final String hadoopConfigurationResources = context.getProperty(HADOOP_CONFIGURATION_RESOURCES).getValue();
final String pySparkAppFile = context.getProperty(PYSPARK_APP_FILE).evaluateAttributeExpressions(flowFile).getValue();
final String pySparkAppArgs = context.getProperty(PYSPARK_APP_ARGS).evaluateAttributeExpressions(flowFile).getValue();
final String pySparkAppName = context.getProperty(PYSPARK_APP_NAME).evaluateAttributeExpressions(flowFile).getValue();
final String pySparkAdditionalFiles = context.getProperty(PYSPARK_ADDITIONAL_FILES).evaluateAttributeExpressions(flowFile).getValue();
final String sparkMaster = context.getProperty(SPARK_MASTER).evaluateAttributeExpressions(flowFile).getValue().trim().toLowerCase();
final String sparkYarnDeployMode = context.getProperty(SPARK_YARN_DEPLOY_MODE).evaluateAttributeExpressions(flowFile).getValue();
final String yarnQueue = context.getProperty(YARN_QUEUE).evaluateAttributeExpressions(flowFile).getValue();
final String sparkHome = context.getProperty(SPARK_HOME).evaluateAttributeExpressions(flowFile).getValue();
final String driverMemory = context.getProperty(DRIVER_MEMORY).evaluateAttributeExpressions(flowFile).getValue();
final String executorMemory = context.getProperty(EXECUTOR_MEMORY).evaluateAttributeExpressions(flowFile).getValue();
final String executorInstances = context.getProperty(EXECUTOR_INSTANCES).evaluateAttributeExpressions(flowFile).getValue();
final String executorCores = context.getProperty(EXECUTOR_CORES).evaluateAttributeExpressions(flowFile).getValue();
final String networkTimeout = context.getProperty(NETWORK_TIMEOUT).evaluateAttributeExpressions(flowFile).getValue();
final String additionalSparkConfigOptions = context.getProperty(ADDITIONAL_SPARK_CONFIG_OPTIONS).evaluateAttributeExpressions(flowFile).getValue();
PySparkUtils pySparkUtils = new PySparkUtils();
/* Get app arguments */
String[] pySparkAppArgsArray = null;
if (!StringUtils.isEmpty(pySparkAppArgs)) {
pySparkAppArgsArray = pySparkUtils.getCsvValuesAsArray(pySparkAppArgs);
logger.info("Provided application arguments: {}", new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAppArgsArray) });
}
/* Get additional python files */
String[] pySparkAdditionalFilesArray = null;
if (!StringUtils.isEmpty(pySparkAdditionalFiles)) {
pySparkAdditionalFilesArray = pySparkUtils.getCsvValuesAsArray(pySparkAdditionalFiles);
logger.info("Provided python files: {}", new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAdditionalFilesArray) });
}
/* Get additional config key-value pairs */
String[] additionalSparkConfigOptionsArray = null;
if (!StringUtils.isEmpty(additionalSparkConfigOptions)) {
additionalSparkConfigOptionsArray = pySparkUtils.getCsvValuesAsArray(additionalSparkConfigOptions);
logger.info("Provided spark config options: {}", new Object[] { pySparkUtils.getCsvStringFromArray(additionalSparkConfigOptionsArray) });
}
/* Determine if Kerberos is enabled */
boolean kerberosEnabled = false;
if (!StringUtils.isEmpty(kerberosPrincipal) && !StringUtils.isEmpty(kerberosKeyTab) && !StringUtils.isEmpty(hadoopConfigurationResources)) {
kerberosEnabled = true;
logger.info("Kerberos is enabled");
}
/* For Kerberized cluster, attempt user authentication */
if (kerberosEnabled) {
logger.info("Attempting user authentication for Kerberos");
ApplySecurityPolicy applySecurityObject = new ApplySecurityPolicy();
Configuration configuration;
try {
logger.info("Getting Hadoop configuration from " + hadoopConfigurationResources);
configuration = ApplySecurityPolicy.getConfigurationFromResources(hadoopConfigurationResources);
if (SecurityUtil.isSecurityEnabled(configuration)) {
logger.info("Security is enabled");
if (kerberosPrincipal.equals("") && kerberosKeyTab.equals("")) {
logger.error("Kerberos Principal and Keytab provided with empty values for a Kerberized cluster.");
session.transfer(flowFile, REL_FAILURE);
return;
}
try {
logger.info("User authentication initiated");
boolean authenticationStatus = applySecurityObject.validateUserWithKerberos(logger, hadoopConfigurationResources, kerberosPrincipal, kerberosKeyTab);
if (authenticationStatus) {
logger.info("User authenticated successfully.");
} else {
logger.error("User authentication failed.");
session.transfer(flowFile, REL_FAILURE);
return;
}
} catch (Exception unknownException) {
logger.error("Unknown exception occurred while validating user: " + unknownException.getMessage());
session.transfer(flowFile, REL_FAILURE);
return;
}
}
} catch (IOException e1) {
logger.error("IOException occurred while authenticating user: " + e1.getMessage());
session.transfer(flowFile, REL_FAILURE);
return;
}
}
/* Build and launch PySpark Job */
logger.info("Configuring PySpark job for execution");
SparkLauncher pySparkLauncher = new SparkLauncher().setAppResource(pySparkAppFile);
logger.info("PySpark app file set to: {}", new Object[] { pySparkAppFile });
if (pySparkAppArgsArray != null && pySparkAppArgsArray.length > 0) {
pySparkLauncher = pySparkLauncher.addAppArgs(pySparkAppArgsArray);
logger.info("App arguments set to: {}", new Object[] { pySparkUtils.getCsvStringFromArray(pySparkAppArgsArray) });
}
pySparkLauncher = pySparkLauncher.setAppName(pySparkAppName).setMaster(sparkMaster);
logger.info("App name set to: {}", new Object[] { pySparkAppName });
logger.info("Spark master set to: {}", new Object[] { sparkMaster });
if (pySparkAdditionalFilesArray != null && pySparkAdditionalFilesArray.length > 0) {
for (String pySparkAdditionalFile : pySparkAdditionalFilesArray) {
pySparkLauncher = pySparkLauncher.addPyFile(pySparkAdditionalFile);
logger.info("Additional python file set to: {}", new Object[] { pySparkAdditionalFile });
}
}
if (sparkMaster.equals("yarn")) {
pySparkLauncher = pySparkLauncher.setDeployMode(sparkYarnDeployMode);
logger.info("YARN deploy mode set to: {}", new Object[] { sparkYarnDeployMode });
}
pySparkLauncher = pySparkLauncher.setSparkHome(sparkHome)
    .setConf(SparkLauncher.DRIVER_MEMORY, driverMemory)
    .setConf(SparkLauncher.EXECUTOR_MEMORY, executorMemory)
    .setConf(CONFIG_PROP_SPARK_EXECUTOR_INSTANCES, executorInstances)
    .setConf(SparkLauncher.EXECUTOR_CORES, executorCores)
    .setConf(CONFIG_PROP_SPARK_NETWORK_TIMEOUT, networkTimeout);
logger.info("Spark home set to: {} ", new Object[] { sparkHome });
logger.info("Driver memory set to: {} ", new Object[] { driverMemory });
logger.info("Executor memory set to: {} ", new Object[] { executorMemory });
logger.info("Executor instances set to: {} ", new Object[] { executorInstances });
logger.info("Executor cores set to: {} ", new Object[] { executorCores });
logger.info("Network timeout set to: {} ", new Object[] { networkTimeout });
if (kerberosEnabled) {
pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_PRINCIPAL, kerberosPrincipal);
pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_KEYTAB, kerberosKeyTab);
logger.info("Kerberos principal set to: {} ", new Object[] { kerberosPrincipal });
logger.info("Kerberos keytab set to: {} ", new Object[] { kerberosKeyTab });
}
if (!StringUtils.isEmpty(yarnQueue)) {
pySparkLauncher = pySparkLauncher.setConf(CONFIG_PROP_SPARK_YARN_QUEUE, yarnQueue);
logger.info("YARN queue set to: {} ", new Object[] { yarnQueue });
}
if (additionalSparkConfigOptionsArray != null && additionalSparkConfigOptionsArray.length > 0) {
for (String additionalSparkConfigOption : additionalSparkConfigOptionsArray) {
String[] confKeyValue = additionalSparkConfigOption.split("=", 2); /* split on the first '=' only, so values may themselves contain '=' */
if (confKeyValue.length == 2) {
pySparkLauncher = pySparkLauncher.setConf(confKeyValue[0], confKeyValue[1]);
logger.info("Spark additional config option set to: {}={}", new Object[] { confKeyValue[0], confKeyValue[1] });
}
}
}
logger.info("Starting execution of PySpark job");
Process pySparkProcess = pySparkLauncher.launch();
InputStreamReaderRunnable inputStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO, logger, pySparkProcess.getInputStream());
Thread inputThread = new Thread(inputStreamReaderRunnable, "stream input");
inputThread.start();
InputStreamReaderRunnable errorStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO, logger, pySparkProcess.getErrorStream());
Thread errorThread = new Thread(errorStreamReaderRunnable, "stream error");
errorThread.start();
logger.info("Waiting for PySpark job to complete");
int exitCode = pySparkProcess.waitFor();
if (exitCode != 0) {
logger.info("Finished execution of PySpark job [FAILURE] [Status code: {}]", new Object[] { exitCode });
session.transfer(flowFile, REL_FAILURE);
} else {
logger.info("Finished execution of PySpark job [SUCCESS] [Status code: {}]", new Object[] { exitCode });
session.transfer(flowFile, REL_SUCCESS);
}
} catch (final Exception e) {
logger.error("Unable to execute PySpark job [FAILURE] for flowfile: {}", new Object[] { flowFile }, e);
session.transfer(flowFile, REL_FAILURE);
}
}
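
For reference, here is a minimal, self-contained sketch of the same launch pattern without the NiFi plumbing: configure a SparkLauncher for a PySpark script, drain the child process streams, and wait for the exit code. The paths, master URL, and memory values are hypothetical placeholders, not values taken from the processor above.

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;

import org.apache.spark.launcher.SparkLauncher;

public class PySparkLaunchSketch {

    public static void main(String[] args) throws Exception {
        // All paths and values below are placeholders for illustration only.
        Process pyspark = new SparkLauncher()
                .setSparkHome("/usr/lib/spark")                 // assumed Spark installation
                .setAppResource("/jobs/clean_data.py")          // main PySpark script
                .addPyFile("/jobs/helpers.py")                  // extra module shipped with the job
                .addAppArgs("2019-01-01", "hdfs:///data/raw")   // application arguments
                .setAppName("example-pyspark-job")
                .setMaster("yarn")
                .setDeployMode("cluster")
                .setConf(SparkLauncher.DRIVER_MEMORY, "512m")
                .setConf(SparkLauncher.EXECUTOR_MEMORY, "1g")
                .launch();

        // Drain stdout and stderr so the child process cannot block on a full pipe,
        // mirroring what the processor does with InputStreamReaderRunnable.
        drain(pyspark.getInputStream(), "spark-stdout");
        drain(pyspark.getErrorStream(), "spark-stderr");

        int exitCode = pyspark.waitFor();
        System.out.println("spark-submit exited with code " + exitCode);
    }

    private static void drain(InputStream in, String name) {
        new Thread(() -> {
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println("[" + name + "] " + line);
                }
            } catch (Exception ignored) {
                // stream closes when the process exits
            }
        }, name).start();
    }
}

Draining stdout and stderr matters because spark-submit writes its progress there; if the pipes fill up, the child process can block indefinitely.
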
Use of org.apache.spark.launcher.SparkLauncher in project kylo by Teradata.
From the class ExecuteSparkJob, method onTrigger:
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
final ComponentLog logger = getLog();
FlowFile flowFile = session.get();
if (flowFile == null) {
return;
}
String PROVENANCE_JOB_STATUS_KEY = "Job Status";
String PROVENANCE_SPARK_EXIT_CODE_KEY = "Spark Exit Code";
try {
PROVENANCE_JOB_STATUS_KEY = context.getName() + " Job Status";
PROVENANCE_SPARK_EXIT_CODE_KEY = context.getName() + " Spark Exit Code";
/* Configuration parameters for spark launcher */
String appJar = getApplicationJar(context, flowFile);
String mainClass = getMainClass(context, flowFile);
String[] appArgs = getMainArgs(context, flowFile);
String extraJars = getExtraJars(context, flowFile);
String yarnQueue = context.getProperty(YARN_QUEUE).evaluateAttributeExpressions(flowFile).getValue();
String sparkMaster = context.getProperty(SPARK_MASTER).evaluateAttributeExpressions(flowFile).getValue().trim();
String sparkYarnDeployMode = context.getProperty(SPARK_YARN_DEPLOY_MODE).evaluateAttributeExpressions(flowFile).getValue();
String driverMemory = context.getProperty(DRIVER_MEMORY).evaluateAttributeExpressions(flowFile).getValue();
String executorMemory = context.getProperty(EXECUTOR_MEMORY).evaluateAttributeExpressions(flowFile).getValue();
String numberOfExecutors = context.getProperty(NUMBER_EXECUTORS).evaluateAttributeExpressions(flowFile).getValue();
String sparkApplicationName = context.getProperty(SPARK_APPLICATION_NAME).evaluateAttributeExpressions(flowFile).getValue();
String executorCores = context.getProperty(EXECUTOR_CORES).evaluateAttributeExpressions(flowFile).getValue();
String networkTimeout = context.getProperty(NETWORK_TIMEOUT).evaluateAttributeExpressions(flowFile).getValue();
String principal = context.getProperty(kerberosPrincipal).getValue();
String keyTab = context.getProperty(kerberosKeyTab).getValue();
String hadoopConfigurationResources = context.getProperty(HADOOP_CONFIGURATION_RESOURCES).getValue();
String sparkConfs = context.getProperty(SPARK_CONFS).evaluateAttributeExpressions(flowFile).getValue();
String extraFiles = context.getProperty(EXTRA_SPARK_FILES).evaluateAttributeExpressions(flowFile).getValue();
Integer sparkProcessTimeout = context.getProperty(PROCESS_TIMEOUT).evaluateAttributeExpressions(flowFile).asTimePeriod(TimeUnit.SECONDS).intValue();
String datasourceIds = context.getProperty(DATASOURCES).evaluateAttributeExpressions(flowFile).getValue();
String catalogDataSourceIds = context.getProperty(CATALOG_DATASOURCES).evaluateAttributeExpressions(flowFile).getValue();
String dataSetIds = context.getProperty(DATASETS).evaluateAttributeExpressions(flowFile).getValue();
MetadataProviderService metadataService = context.getProperty(METADATA_SERVICE).asControllerService(MetadataProviderService.class);
final List<String> extraJarPaths = getExtraJarPaths(extraJars);
// If all 3 fields are filled out then assume kerberos is enabled, and user should be authenticated
boolean isAuthenticated = !StringUtils.isEmpty(principal) && !StringUtils.isEmpty(keyTab) && !StringUtils.isEmpty(hadoopConfigurationResources);
try {
if (isAuthenticated && isSecurityEnabled(hadoopConfigurationResources)) {
logger.info("Security is enabled");
if (principal.equals("") && keyTab.equals("")) {
logger.error("Kerberos Principal and Kerberos KeyTab information missing in Kerberos-enabled cluster. {} ", new Object[] { flowFile });
session.transfer(flowFile, REL_FAILURE);
return;
}
logger.info("User authentication initiated");
boolean authenticationStatus = new ApplySecurityPolicy().validateUserWithKerberos(logger, hadoopConfigurationResources, principal, keyTab);
if (authenticationStatus) {
logger.info("User authenticated successfully.");
} else {
logger.error("User authentication failed. {} ", new Object[] { flowFile });
session.transfer(flowFile, REL_FAILURE);
return;
}
}
} catch (IOException e1) {
logger.error("IOException occurred while authenticating user: {} and flow file: {}", new Object[] { e1.getMessage(), flowFile });
session.transfer(flowFile, REL_FAILURE);
return;
} catch (Exception unknownException) {
logger.error("Unknown exception occurred while validating user: {}. {} ", new Object[] { unknownException.getMessage(), flowFile });
session.transfer(flowFile, REL_FAILURE);
return;
}
String sparkHome = context.getProperty(SPARK_HOME).evaluateAttributeExpressions(flowFile).getValue();
// Build environment
final Map<String, String> env = getDatasources(session, flowFile, PROVENANCE_JOB_STATUS_KEY, datasourceIds, dataSetIds, catalogDataSourceIds, metadataService, extraJarPaths);
if (env != null) {
StringBuilder datasourceSummary = new StringBuilder();
if (env.containsKey("DATASETS")) {
final int count = StringUtils.countMatches(env.get("DATASETS"), ',') + 1;
datasourceSummary.append(count).append(" datasets");
}
if (env.containsKey("DATASOURCES")) {
final int count = StringUtils.countMatches(env.get("DATASOURCES"), ',') + 1;
(datasourceSummary.length() > 0 ? datasourceSummary.append("; ") : datasourceSummary).append(count).append(" legacy datasources");
}
if (env.containsKey("CATALOG_DATASOURCES")) {
final int count = StringUtils.countMatches(env.get("CATALOG_DATASOURCES"), ',') + 1;
(datasourceSummary.length() > 0 ? datasourceSummary.append("; ") : datasourceSummary).append(count).append(" catalog datasources");
}
String summaryString = datasourceSummary.toString();
if (StringUtils.isNotBlank(summaryString)) {
flowFile = session.putAttribute(flowFile, "Data source usage", summaryString);
}
} else {
return;
}
addEncryptionSettings(env);
/* Launch the spark job as a child process */
SparkLauncher launcher = new SparkLauncher(env)
    .setAppResource(appJar)
    .setMainClass(mainClass)
    .setMaster(sparkMaster)
    .setConf(SparkLauncher.DRIVER_MEMORY, driverMemory)
    .setConf(SPARK_NUM_EXECUTORS, numberOfExecutors)
    .setConf(SparkLauncher.EXECUTOR_MEMORY, executorMemory)
    .setConf(SparkLauncher.EXECUTOR_CORES, executorCores)
    .setConf(SPARK_NETWORK_TIMEOUT_CONFIG_NAME, networkTimeout)
    .setSparkHome(sparkHome)
    .setAppName(sparkApplicationName);
OptionalSparkConfigurator optionalSparkConf = new OptionalSparkConfigurator(launcher)
    .setDeployMode(sparkMaster, sparkYarnDeployMode)
    .setAuthentication(isAuthenticated, keyTab, principal)
    .addAppArgs(appArgs)
    .addSparkArg(sparkConfs)
    .addExtraJars(extraJarPaths)
    .setYarnQueue(yarnQueue)
    .setExtraFiles(extraFiles);
Process spark = optionalSparkConf.getLaucnher().launch();
/* Read/clear the process input stream */
InputStreamReaderRunnable inputStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO, logger, spark.getInputStream());
Thread inputThread = new Thread(inputStreamReaderRunnable, "stream input");
inputThread.start();
/* Read/clear the process error stream */
InputStreamReaderRunnable errorStreamReaderRunnable = new InputStreamReaderRunnable(LogLevel.INFO, logger, spark.getErrorStream());
Thread errorThread = new Thread(errorStreamReaderRunnable, "stream error");
errorThread.start();
logger.info("Waiting for Spark job to complete");
/* Wait for job completion */
boolean completed = spark.waitFor(sparkProcessTimeout, TimeUnit.SECONDS);
if (!completed) {
spark.destroyForcibly();
getLog().error("Spark process timed out after {} seconds using flow file: {} ", new Object[] { sparkProcessTimeout, flowFile });
session.transfer(flowFile, REL_FAILURE);
return;
}
int exitCode = spark.exitValue();
flowFile = session.putAttribute(flowFile, PROVENANCE_SPARK_EXIT_CODE_KEY, Integer.toString(exitCode));
if (exitCode != 0) {
logger.error("ExecuteSparkJob for {} and flowfile: {} completed with failed status {} ", new Object[] { context.getName(), flowFile, exitCode });
flowFile = session.putAttribute(flowFile, PROVENANCE_JOB_STATUS_KEY, "Failed");
session.transfer(flowFile, REL_FAILURE);
} else {
logger.info("ExecuteSparkJob for {} and flowfile: {} completed with success status {} ", new Object[] { context.getName(), flowFile, exitCode });
flowFile = session.putAttribute(flowFile, PROVENANCE_JOB_STATUS_KEY, "Success");
session.transfer(flowFile, REL_SUCCESS);
}
} catch (final Exception e) {
logger.error("Unable to execute Spark job {},{}", new Object[] { flowFile, e.getMessage() }, e);
flowFile = session.putAttribute(flowFile, PROVENANCE_JOB_STATUS_KEY, "Failed With Exception");
flowFile = session.putAttribute(flowFile, "Spark Exception:", e.getMessage());
session.transfer(flowFile, REL_FAILURE);
}
}
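
Both processors call launch() and then manage the raw Process themselves. SparkLauncher also provides startApplication(), which returns a SparkAppHandle and reports job state through a listener rather than a bare exit code. A minimal sketch, assuming Spark 1.6+ on the classpath; the jar path, main class, and master below are hypothetical placeholders:

import java.util.concurrent.CountDownLatch;

import org.apache.spark.launcher.SparkAppHandle;
import org.apache.spark.launcher.SparkLauncher;

public class SparkJobMonitorSketch {

    public static void main(String[] args) throws Exception {
        CountDownLatch finished = new CountDownLatch(1);

        // Jar path, main class and master are placeholders for illustration only.
        SparkAppHandle handle = new SparkLauncher()
                .setAppResource("/jobs/example-job.jar")
                .setMainClass("com.example.ExampleJob")
                .setMaster("yarn")
                .setDeployMode("cluster")
                .setConf(SparkLauncher.EXECUTOR_MEMORY, "1g")
                .startApplication(new SparkAppHandle.Listener() {
                    @Override
                    public void stateChanged(SparkAppHandle h) {
                        System.out.println("State: " + h.getState() + ", appId: " + h.getAppId());
                        if (h.getState().isFinal()) {
                            finished.countDown();
                        }
                    }

                    @Override
                    public void infoChanged(SparkAppHandle h) {
                        // called when, for example, the application id becomes available
                    }
                });

        finished.await();
        System.out.println("Final state: " + handle.getState());
    }
}

With startApplication() the launcher library captures the child's output itself (routing it through java.util.logging by default, as far as I can tell), so the manual stream-draining threads used above should not be needed; verify the exact log routing against the Spark version in use.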