Use of com.homeaway.datapullclient.config.EMRProperties in project datapull by homeaway.
The class DataPullRequestProcessor, method runDataPull().
private void runDataPull(String json, boolean isStart, boolean validateJson) throws ProcessingException {
    String originalInputJson = json;
    json = extractUserJsonFromS3IfProvided(json, isStart);
    final EMRProperties emrProperties = this.config.getEmrProperties();
    if (log.isDebugEnabled())
        log.debug("runDataPull -> json = " + json + " isStart = " + isStart);
    try {
        if (validateJson) {
            json = validateAndEnrich(json);
        }
        log.info("Running datapull for json : " + json + " isStart = " + isStart + " env = " + env);
        final ObjectNode node = new ObjectMapper().readValue(json, ObjectNode.class);
        List<Map.Entry<String, JsonNode>> result = new LinkedList<>();
        Iterator<Map.Entry<String, JsonNode>> nodes = node.fields();
        while (nodes.hasNext()) {
            result.add(nodes.next());
        }
        // use orElse(null) so a missing "cluster" entry falls through to the explicit check below
        JsonNode clusterNode = result.stream()
                .filter(y -> y.getKey().equalsIgnoreCase("cluster"))
                .map(Map.Entry::getValue)
                .findAny().orElse(null);
        JsonNode migrationsNode = result.stream()
                .filter(y -> y.getKey().equalsIgnoreCase("migrations"))
                .map(Map.Entry::getValue)
                .findAny().get();
        if (clusterNode == null)
            throw new ProcessingException("Invalid Json!!! Cluster properties cannot be null");
        String creator = node.has(CREATOR) ? node.findValue(CREATOR).asText() : "";
        ObjectMapper mapper = new ObjectMapper();
        ClusterProperties reader = mapper.treeToValue(clusterNode, ClusterProperties.class);
        Migration[] myObjects = mapper.treeToValue(migrationsNode, Migration[].class);
        String cronExp = Objects.toString(reader.getCronExpression(), "");
        if (!cronExp.isEmpty())
            cronExp = validateAndProcessCronExpression(cronExp);
        String pipeline = Objects.toString(reader.getPipelineName(), UUID.randomUUID().toString());
        String pipelineEnv = Objects.toString(reader.getAwsEnv(), env);
        DataPullProperties dataPullProperties = config.getDataPullProperties();
        String applicationHistoryFolder = dataPullProperties.getApplicationHistoryFolder();
        String s3RepositoryBucketName = dataPullProperties.getS3BucketName();
        String jobName = pipelineEnv + PIPELINE_NAME_DELIMITER + EMR + PIPELINE_NAME_DELIMITER
                + pipeline + PIPELINE_NAME_DELIMITER + PIPELINE_NAME_SUFFIX;
        String applicationHistoryFolderPath = applicationHistoryFolder == null || applicationHistoryFolder.isEmpty()
                ? s3RepositoryBucketName + "/" + DATAPULL_HISTORY_FOLDER
                : applicationHistoryFolder;
        String bootstrapFilePath = s3RepositoryBucketName + "/" + BOOTSTRAP_FOLDER;
        String filePath = applicationHistoryFolderPath + "/" + jobName;
        String bootstrapFile = jobName + ".sh";
        String jksFilePath = bootstrapFilePath + "/" + bootstrapFile;
        String bootstrapActionStringFromUser = Objects.toString(reader.getBootstrapactionstring(), "");
        String defaultBootstrapString = emrProperties.getDefaultBootstrapString();
        Boolean haveBootstrapAction = createBootstrapScript(myObjects, bootstrapFile, bootstrapFilePath,
                bootstrapActionStringFromUser, defaultBootstrapString);
        DataPullTask task = createDataPullTask(filePath, jksFilePath, reader, jobName, creator,
                node.path("sparkjarfile").asText(), haveBootstrapAction);
        if (!isStart) {
            // persist the user's original json, not the enriched copy
            json = originalInputJson.equals(json) ? json : originalInputJson;
            saveConfig(applicationHistoryFolderPath, jobName + ".json", json);
        }
        if (!isStart && tasksMap.containsKey(jobName))
            cancelExistingTask(jobName);
        if (!(isStart && cronExp.isEmpty())) {
            // recurring pipelines get a cron trigger; one-off runs start one second from now
            Future<?> future = !cronExp.isEmpty()
                    ? scheduler.schedule(task, new CronTrigger(cronExp))
                    : scheduler.schedule(task, new Date(System.currentTimeMillis() + 1000));
            tasksMap.put(jobName, future);
        }
    } catch (IOException e) {
        throw new ProcessingException("exception while starting datapull " + e.getLocalizedMessage());
    }
    if (log.isDebugEnabled())
        log.debug("runDataPull <- return");
}
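For context, runDataPull reads everything cluster-related from the EMRProperties bean exposed by this.config. Below is a minimal sketch of what that binding could look like, with field names inferred from the getters used on this page; the Spring Boot @ConfigurationProperties prefix and the use of Lombok are assumptions, not the project's actual declaration.

import java.util.HashMap;
import java.util.Map;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

// Hypothetical sketch of the EMRProperties binding; the "datapull.emr" prefix
// and Lombok @Data are assumptions inferred from the getters exercised above.
@Data
@Component
@ConfigurationProperties(prefix = "datapull.emr")
public class EMRProperties {
    private int instanceCount;
    private String masterType;
    private String slaveType;
    private String ec2KeyName;
    private String serviceRole;
    private String jobFlowRole;
    private String emrRelease;
    private String defaultBootstrapString;
    private Map<String, String> tags = new HashMap<>();
    // ... plus the security-group ids and hive/spark-hive property maps used further down
}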
Use of com.homeaway.datapullclient.config.EMRProperties in project datapull by homeaway.
The class DataPullTask, method addTagsToEMRCluster().
private void addTagsToEMRCluster() {
    final EMRProperties emrProperties = this.config.getEmrProperties();
    final Map<String, String> tags = emrProperties.getTags();
    this.addTags(tags);
}
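addTags itself is not shown on this page; judging by withTags(this.emrTags.values()) in runTaskInNewCluster, it converts the configured map into AWS SDK Tag objects keyed for the cluster request. A plausible sketch of that conversion (the helper name toEmrTags is hypothetical):

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.amazonaws.services.elasticmapreduce.model.Tag;

// Hypothetical helper: turns the configured tag map into the EMR Tag objects
// that RunJobFlowRequest.withTags(...) expects.
private static List<Tag> toEmrTags(final Map<String, String> tags) {
    return tags.entrySet().stream()
            .map(e -> new Tag().withKey(e.getKey()).withValue(e.getValue()))
            .collect(Collectors.toList());
}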
Use of com.homeaway.datapullclient.config.EMRProperties in project datapull by homeaway.
The class DataPullTask, method runTaskInNewCluster().
private RunJobFlowResult runTaskInNewCluster(final AmazonElasticMapReduce emr, final String logPath,
        final String jarPath, final String sparkSubmitParams, final Boolean haveBootstrapAction) {
    final HadoopJarStepConfig runExampleConfig;
    if (sparkSubmitParams != null && !sparkSubmitParams.isEmpty()) {
        final List<String> sparkSubmitParamsList = this.prepareSparkSubmitParams(sparkSubmitParams);
        runExampleConfig = new HadoopJarStepConfig()
                .withJar("command-runner.jar")
                .withArgs(sparkSubmitParamsList);
    } else {
        runExampleConfig = new HadoopJarStepConfig()
                .withJar("command-runner.jar")
                .withArgs("spark-submit",
                        "--conf", "spark.default.parallelism=3",
                        "--conf", "spark.storage.blockManagerSlaveTimeoutMs=1200s",
                        "--conf", "spark.executor.heartbeatInterval=900s",
                        "--conf", "spark.driver.extraJavaOptions=-Djavax.net.ssl.trustStore=/etc/pki/java/cacerts/ -Djavax.net.ssl.trustStorePassword=changeit",
                        "--conf", "spark.executor.extraJavaOptions=-Djavax.net.ssl.trustStore=/etc/pki/java/cacerts/ -Djavax.net.ssl.trustStorePassword=changeit",
                        "--packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4,org.apache.spark:spark-avro_2.11:2.4.4",
                        "--class", DataPullTask.MAIN_CLASS,
                        jarPath,
                        String.format(DataPullTask.JSON_WITH_INPUT_FILE_PATH, this.jsonS3Path));
    }
    final StepConfig customExampleStep = new StepConfig()
            .withName(this.taskId)
            .withActionOnFailure("CONTINUE")
            .withHadoopJarStep(runExampleConfig);
    final Application spark = new Application().withName("Spark");
    final Application hive = new Application().withName("Hive");
    final EMRProperties emrProperties = this.config.getEmrProperties();
    final int instanceCount = emrProperties.getInstanceCount();
    final String masterType = emrProperties.getMasterType();
    final DataPullProperties datapullProperties = this.config.getDataPullProperties();
    final String applicationSubnet = Objects.toString(this.clusterProperties.getSubnetId(),
            datapullProperties.getApplicationSubnet1());
    final int count = Integer.valueOf(Objects.toString(this.clusterProperties.getEmrInstanceCount(),
            Integer.toString(instanceCount)));
    final JobFlowInstancesConfig jobConfig = new JobFlowInstancesConfig()
            // passing an invalid key will make the process terminate
            .withEc2KeyName(Objects.toString(this.clusterProperties.getEc2KeyName(), emrProperties.getEc2KeyName()))
            .withEc2SubnetId(applicationSubnet)
            .withMasterInstanceType(Objects.toString(this.clusterProperties.getMasterInstanceType(), masterType))
            .withInstanceCount(count)
            .withKeepJobFlowAliveWhenNoSteps(!Boolean.valueOf(
                    Objects.toString(this.clusterProperties.getTerminateClusterAfterExecution(), "true")));
    final String masterSG = emrProperties.getEmrSecurityGroupMaster();
    final String slaveSG = emrProperties.getEmrSecurityGroupSlave();
    final String serviceAccessSG = emrProperties.getEmrSecurityGroupServiceAccess();
    final String masterSecurityGroup = Objects.toString(this.clusterProperties.getMasterSecurityGroup(),
            masterSG != null ? masterSG : "");
    final String slaveSecurityGroup = Objects.toString(this.clusterProperties.getSlaveSecurityGroup(),
            slaveSG != null ? slaveSG : "");
    final String serviceAccessSecurityGroup = Objects.toString(this.clusterProperties.getServiceAccessSecurityGroup(),
            serviceAccessSG != null ? serviceAccessSG : "");
    if (!masterSecurityGroup.isEmpty()) {
        jobConfig.withEmrManagedMasterSecurityGroup(masterSecurityGroup);
    }
    if (!slaveSecurityGroup.isEmpty()) {
        jobConfig.withEmrManagedSlaveSecurityGroup(slaveSecurityGroup);
    }
    if (!serviceAccessSecurityGroup.isEmpty()) {
        jobConfig.withServiceAccessSecurityGroup(serviceAccessSecurityGroup);
    }
    final String slaveType = emrProperties.getSlaveType();
    if (count > 1) {
        jobConfig.withSlaveInstanceType(Objects.toString(this.clusterProperties.getSlaveInstanceType(), slaveType));
    }
    this.addTagsToEMRCluster();
    final Map<String, String> sparkProperties = new HashMap<>();
    sparkProperties.put("maximizeResourceAllocation", "true");
    final String emrReleaseVersion = emrProperties.getEmrRelease();
    final String serviceRole = emrProperties.getServiceRole();
    final String jobFlowRole = emrProperties.getJobFlowRole();
    final String emrSecurityConfiguration = Objects.toString(clusterProperties.getEmr_security_configuration(), "");
    Map<String, String> emrfsProperties = new HashMap<>();
    emrfsProperties.put("fs.s3.canned.acl", "BucketOwnerFullControl");
    Configuration myEmrfsConfig = new Configuration()
            .withClassification("emrfs-site")
            .withProperties(emrfsProperties);
    // cluster-level overrides win over values configured on EMRProperties
    Map<String, String> sparkHiveProperties = emrProperties.getSparkHiveProperties().entrySet().stream()
            .filter(keyVal -> !keyVal.getValue().isEmpty())
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    sparkHiveProperties.putAll(this.clusterProperties.getSparkHiveProperties());
    Map<String, String> hiveProperties = emrProperties.getHiveProperties().entrySet().stream()
            .filter(keyVal -> !keyVal.getValue().isEmpty())
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    hiveProperties.putAll(this.clusterProperties.getHiveProperties());
    Configuration sparkHiveConfig = new Configuration()
            .withClassification("spark-hive-site")
            .withProperties(sparkHiveProperties);
    Configuration hiveConfig = new Configuration()
            .withClassification("hive-site")
            .withProperties(hiveProperties);
    final RunJobFlowRequest request = new RunJobFlowRequest()
            .withName(this.taskId)
            .withReleaseLabel(Objects.toString(this.clusterProperties.getEmrReleaseVersion(), emrReleaseVersion))
            .withSteps(customExampleStep)
            .withApplications(spark, hive)
            .withLogUri(logPath)
            .withServiceRole(Objects.toString(this.clusterProperties.getEmrServiceRole(), serviceRole))
            .withJobFlowRole(Objects.toString(this.clusterProperties.getInstanceProfile(), jobFlowRole))
            .withVisibleToAllUsers(true)
            .withTags(this.emrTags.values())
            .withConfigurations(new Configuration().withClassification("spark").withProperties(sparkProperties),
                    myEmrfsConfig)
            .withInstances(jobConfig);
    if (!hiveProperties.isEmpty()) {
        request.withConfigurations(hiveConfig);
    }
    if (!sparkHiveProperties.isEmpty()) {
        request.withConfigurations(sparkHiveConfig);
    }
    if (!emrSecurityConfiguration.isEmpty()) {
        request.withSecurityConfiguration(emrSecurityConfiguration);
    }
    if (haveBootstrapAction) {
        final BootstrapActionConfig bsConfig = new BootstrapActionConfig();
        bsConfig.setName("bootstrapaction");
        bsConfig.setScriptBootstrapAction(new ScriptBootstrapActionConfig().withPath("s3://" + this.jksS3Path));
        request.withBootstrapActions(bsConfig);
    }
    return emr.runJobFlow(request);
}
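The returned RunJobFlowResult carries the job-flow id of the newly launched cluster. A short usage sketch, assuming the same AWS SDK v1 client, of how a caller could check the cluster's state after submission (variable names are illustrative):

import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduce;
import com.amazonaws.services.elasticmapreduce.model.DescribeClusterRequest;
import com.amazonaws.services.elasticmapreduce.model.DescribeClusterResult;
import com.amazonaws.services.elasticmapreduce.model.RunJobFlowResult;

// Hypothetical caller-side check on the cluster just launched above.
final RunJobFlowResult result = runTaskInNewCluster(emr, logPath, jarPath, sparkSubmitParams, false);
final DescribeClusterResult described = emr.describeCluster(
        new DescribeClusterRequest().withClusterId(result.getJobFlowId()));
log.info("Cluster " + result.getJobFlowId() + " is in state "
        + described.getCluster().getStatus().getState());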
Use of com.homeaway.datapullclient.config.EMRProperties in project datapull by homeaway.
The class DataPullTask, method toString().
@Override
public String toString() {
    final DataPullProperties dataPullProperties = this.config.getDataPullProperties();
    final EMRProperties emrProperties = this.config.getEmrProperties();
    return "DataPullTask{" +
            "taskId='" + this.taskId + '\'' +
            ", jsonS3Path='" + this.jsonS3Path + '\'' +
            ", logFilePath='" + dataPullProperties.getLogFilePath() + '\'' +
            ", s3RepositoryBucketName='" + dataPullProperties.getS3BucketName() + '\'' +
            ", s3JarPath='" + this.s3JarPath + '\'' +
            ", instanceCount=" + emrProperties.getInstanceCount() +
            ", masterType='" + emrProperties.getMasterType() + '\'' +
            ", slaveType='" + emrProperties.getSlaveType() + '\'' +
            ", serviceRole='" + emrProperties.getServiceRole() + '\'' +
            ", jobFlowRole='" + emrProperties.getJobFlowRole() + '\'' +
            ", emrReleaseVersion='" + emrProperties.getEmrRelease() + '\'' +
            ", config=" + this.config +
            '}';
}
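Since the diagnostic string pulls live values from both property beans, it is useful wherever tasks are scheduled or cancelled. A minimal, hypothetical call site (logger and variable names assumed):

// DataPullTask.toString() is invoked implicitly by the string concatenation,
// emitting one line with the effective EMR settings for this task.
if (log.isDebugEnabled()) {
    log.debug("Scheduling task " + task);
}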