Search in sources:

Example 1 with EMRProperties

Use of com.homeaway.datapullclient.config.EMRProperties in project datapull by homeaway.

The class DataPullRequestProcessor, method runDataPull:

private void runDataPull(String json, boolean isStart, boolean validateJson) throws ProcessingException {
    String originalInputJson = json;
    json = extractUserJsonFromS3IfProvided(json, isStart);
    final EMRProperties emrProperties = this.config.getEmrProperties();
    if (log.isDebugEnabled())
        log.debug("runDataPull -> json = " + json + " isStart = " + isStart);
    try {
        if (validateJson) {
            json = validateAndEnrich(json);
        }
        log.info("Running datapull for json : " + json + " cron expression = " + isStart + "env =" + env);
        final ObjectNode node = new ObjectMapper().readValue(json, ObjectNode.class);
        List<Map.Entry<String, JsonNode>> result = new LinkedList<>();
        Iterator<Map.Entry<String, JsonNode>> nodes = node.fields();
        while (nodes.hasNext()) {
            result.add(nodes.next());
        }
        // a missing key yields null here so the validation below can report it
        JsonNode clusterNode = result.stream().filter(y -> y.getKey().equalsIgnoreCase("cluster")).map(Map.Entry::getValue).findAny().orElse(null);
        JsonNode migrationsNode = result.stream().filter(y -> y.getKey().equalsIgnoreCase("migrations")).map(Map.Entry::getValue).findAny().orElse(null);
        if (clusterNode == null)
            throw new ProcessingException("Invalid Json!!! Cluster properties cannot be null");
        if (migrationsNode == null)
            throw new ProcessingException("Invalid Json!!! Migrations cannot be null");
        String creator = node.has(CREATOR) ? node.findValue(CREATOR).asText() : "";
        ObjectMapper mapper = new ObjectMapper();
        ClusterProperties reader = mapper.treeToValue(clusterNode, ClusterProperties.class);
        Migration[] myObjects = mapper.treeToValue(migrationsNode, Migration[].class);
        String cronExp = Objects.toString(reader.getCronExpression(), "");
        if (!cronExp.isEmpty())
            cronExp = validateAndProcessCronExpression(cronExp);
        String pipeline = Objects.toString(reader.getPipelineName(), UUID.randomUUID().toString());
        String pipelineEnv = Objects.toString(reader.getAwsEnv(), env);
        DataPullProperties dataPullProperties = config.getDataPullProperties();
        String applicationHistoryFolder = dataPullProperties.getApplicationHistoryFolder();
        String s3RepositoryBucketName = dataPullProperties.getS3BucketName();
        String jobName = pipelineEnv + PIPELINE_NAME_DELIMITER + EMR + PIPELINE_NAME_DELIMITER + pipeline + PIPELINE_NAME_DELIMITER + PIPELINE_NAME_SUFFIX;
        String applicationHistoryFolderPath = applicationHistoryFolder == null || applicationHistoryFolder.isEmpty() ? s3RepositoryBucketName + "/" + DATAPULL_HISTORY_FOLDER : applicationHistoryFolder;
        String bootstrapFilePath = s3RepositoryBucketName + "/" + BOOTSTRAP_FOLDER;
        String filePath = applicationHistoryFolderPath + "/" + jobName;
        String bootstrapFile = jobName + ".sh";
        String jksFilePath = bootstrapFilePath + "/" + bootstrapFile;
        String bootstrapActionStringFromUser = Objects.toString(reader.getBootstrapactionstring(), "");
        String defaultBootstrapString = emrProperties.getDefaultBootstrapString();
        Boolean haveBootstrapAction = createBootstrapScript(myObjects, bootstrapFile, bootstrapFilePath, bootstrapActionStringFromUser, defaultBootstrapString);
        DataPullTask task = createDataPullTask(filePath, jksFilePath, reader, jobName, creator, node.path("sparkjarfile").asText(), haveBootstrapAction);
        if (!isStart) {
            json = originalInputJson; // persist the user's original (pre-enrichment) JSON
            saveConfig(applicationHistoryFolderPath, jobName + ".json", json);
        }
        if (!isStart && tasksMap.containsKey(jobName))
            cancelExistingTask(jobName);
        if (!(isStart && cronExp.isEmpty())) {
            Future<?> future = !cronExp.isEmpty()
                    ? scheduler.schedule(task, new CronTrigger(cronExp))
                    : scheduler.schedule(task, new Date(System.currentTimeMillis() + 1000));
            tasksMap.put(jobName, future);
        }
    } catch (IOException e) {
        throw new ProcessingException("exception while starting datapull " + e.getLocalizedMessage());
    }
    if (log.isDebugEnabled())
        log.debug("runDataPull <- return");
}
Also used: java.util(java.util) DataPullClientService(com.homeaway.datapullclient.service.DataPullClientService) DataPullContextHolder(com.homeaway.datapullclient.config.DataPullContextHolder) Autowired(org.springframework.beans.factory.annotation.Autowired) DataPullProperties(com.homeaway.datapullclient.config.DataPullProperties) ProcessingException(com.homeaway.datapullclient.exception.ProcessingException) ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) EMRProperties(com.homeaway.datapullclient.config.EMRProperties) Value(org.springframework.beans.factory.annotation.Value) PathMatchingResourcePatternResolver(org.springframework.core.io.support.PathMatchingResourcePatternResolver) Future(java.util.concurrent.Future) JSONObject(org.json.JSONObject) ByteArrayInputStream(java.io.ByteArrayInputStream) JsonInputFile(com.homeaway.datapullclient.input.JsonInputFile) Service(org.springframework.stereotype.Service) AmazonS3(com.amazonaws.services.s3.AmazonS3) JsonNode(com.fasterxml.jackson.databind.JsonNode) ThreadPoolTaskScheduler(org.springframework.scheduling.concurrent.ThreadPoolTaskScheduler) SchemaLoader(org.everit.json.schema.loader.SchemaLoader) Resource(org.springframework.core.io.Resource) Migration(com.homeaway.datapullclient.input.Migration) ValidationException(org.everit.json.schema.ValidationException) Source(com.homeaway.datapullclient.input.Source) DataPullContext(com.homeaway.datapullclient.config.DataPullContext) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) JSONTokener(org.json.JSONTokener) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) IOException(java.io.IOException) com.amazonaws.services.s3.model(com.amazonaws.services.s3.model) ClusterProperties(com.homeaway.datapullclient.input.ClusterProperties) InputStreamReader(java.io.InputStreamReader) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) CronTrigger(org.springframework.scheduling.support.CronTrigger) InvalidPointedJsonException(com.homeaway.datapullclient.exception.InvalidPointedJsonException) Slf4j(lombok.extern.slf4j.Slf4j) PostConstruct(javax.annotation.PostConstruct) DataPullClientConfig(com.homeaway.datapullclient.config.DataPullClientConfig) Schema(org.everit.json.schema.Schema) ResourcePatternResolver(org.springframework.core.io.support.ResourcePatternResolver) BufferedReader(java.io.BufferedReader)
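
For orientation, runDataPull expects a JSON document with top-level "cluster" and "migrations" entries, an optional "creator", and an optional "sparkjarfile". A hypothetical input might look like the sketch below; the key names are inferred from the getters used above (exact casing depends on the Jackson bindings of ClusterProperties and Migration, which are not shown here), and every value is illustrative:

{
    "creator": "jane.doe",
    "cluster": {
        "pipelinename": "orders-nightly",
        "awsenv": "dev",
        "cronexpression": "0 0 2 * * ?",
        "terminateclusterafterexecution": "true"
    },
    "migrations": [],
    "sparkjarfile": "s3://my-bucket/custom-datapull.jar"
}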

Example 2 with EMRProperties

Use of com.homeaway.datapullclient.config.EMRProperties in project datapull by homeaway.

The class DataPullTask, method addTagsToEMRCluster:

private void addTagsToEMRCluster() {
    final EMRProperties emrProperties = this.config.getEmrProperties();
    final Map<String, String> tags = emrProperties.getTags();
    this.addTags(tags);
}
Also used: EMRProperties(com.homeaway.datapullclient.config.EMRProperties)
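
Across these examples, EMRProperties is obtained from DataPullClientConfig and supplies defaults such as instance counts, instance types, roles, security groups, and tags. A minimal sketch of how such a bean is presumably bound in a Spring Boot application follows; the "datapull.emr" prefix, the class shape, and the use of Lombok are assumptions for illustration, not the actual datapull source:

import java.util.HashMap;
import java.util.Map;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;

// Hypothetical sketch of a configuration-properties bean shaped like EMRProperties.
// The property prefix is an assumption; the fields mirror the getters called above.
@Data
@ConfigurationProperties(prefix = "datapull.emr")
public class EmrPropertiesSketch {
    private int instanceCount;          // default cluster size (getInstanceCount)
    private String masterType;          // default master instance type (getMasterType)
    private String slaveType;           // default worker instance type (getSlaveType)
    private String serviceRole;         // EMR service role (getServiceRole)
    private String jobFlowRole;         // EC2 instance profile (getJobFlowRole)
    private String emrRelease;          // EMR release label (getEmrRelease)
    private String ec2KeyName;          // SSH key pair name (getEc2KeyName)
    private Map<String, String> tags = new HashMap<>(); // consumed by addTagsToEMRCluster()
}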

Example 3 with EMRProperties

Use of com.homeaway.datapullclient.config.EMRProperties in project datapull by homeaway.

The class DataPullTask, method runTaskInNewCluster:

private RunJobFlowResult runTaskInNewCluster(final AmazonElasticMapReduce emr, final String logPath, final String jarPath, final String sparkSubmitParams, final Boolean haveBootstrapAction) {
    final HadoopJarStepConfig runExampleConfig;
    if (sparkSubmitParams != null && !sparkSubmitParams.isEmpty()) {
        final List<String> sparkSubmitParamsList = this.prepareSparkSubmitParams(sparkSubmitParams);
        runExampleConfig = new HadoopJarStepConfig().withJar("command-runner.jar").withArgs(sparkSubmitParamsList);
    } else {
        runExampleConfig = new HadoopJarStepConfig().withJar("command-runner.jar").withArgs(
                "spark-submit",
                "--conf", "spark.default.parallelism=3",
                "--conf", "spark.storage.blockManagerSlaveTimeoutMs=1200s", "--conf", "spark.executor.heartbeatInterval=900s",
                "--conf", "spark.driver.extraJavaOptions=-Djavax.net.ssl.trustStore=/etc/pki/java/cacerts/ -Djavax.net.ssl.trustStorePassword=changeit",
                "--conf", "spark.executor.extraJavaOptions=-Djavax.net.ssl.trustStore=/etc/pki/java/cacerts/ -Djavax.net.ssl.trustStorePassword=changeit",
                "--packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4,org.apache.spark:spark-avro_2.11:2.4.4",
                "--class", DataPullTask.MAIN_CLASS, jarPath, String.format(DataPullTask.JSON_WITH_INPUT_FILE_PATH, this.jsonS3Path));
    }
    final StepConfig customExampleStep = new StepConfig().withName(this.taskId).withActionOnFailure("CONTINUE").withHadoopJarStep(runExampleConfig);
    final Application spark = new Application().withName("Spark");
    final Application hive = new Application().withName("Hive");
    final EMRProperties emrProperties = this.config.getEmrProperties();
    final int instanceCount = emrProperties.getInstanceCount();
    final String masterType = emrProperties.getMasterType();
    final DataPullProperties datapullProperties = this.config.getDataPullProperties();
    final String applicationSubnet = Objects.toString(this.clusterProperties.getSubnetId(), datapullProperties.getApplicationSubnet1());
    final int count = Integer.parseInt(Objects.toString(this.clusterProperties.getEmrInstanceCount(), Integer.toString(instanceCount)));
    // Note: passing an invalid EC2 key name will make the job flow terminate.
    final JobFlowInstancesConfig jobConfig = new JobFlowInstancesConfig()
            .withEc2KeyName(Objects.toString(this.clusterProperties.getEc2KeyName(), emrProperties.getEc2KeyName()))
            .withEc2SubnetId(applicationSubnet)
            .withMasterInstanceType(Objects.toString(this.clusterProperties.getMasterInstanceType(), masterType))
            .withInstanceCount(count)
            .withKeepJobFlowAliveWhenNoSteps(!Boolean.parseBoolean(Objects.toString(this.clusterProperties.getTerminateClusterAfterExecution(), "true")));
    final String masterSG = emrProperties.getEmrSecurityGroupMaster();
    final String slaveSG = emrProperties.getEmrSecurityGroupSlave();
    final String serviceAccess = emrProperties.getEmrSecurityGroupServiceAccess();
    final String masterSecurityGroup = Objects.toString(this.clusterProperties.getMasterSecurityGroup(), masterSG != null ? masterSG : "");
    final String slaveSecurityGroup = Objects.toString(this.clusterProperties.getSlaveSecurityGroup(), slaveSG != null ? slaveSG : "");
    final String serviceAccessSecurityGroup = Objects.toString(this.clusterProperties.getServiceAccessSecurityGroup(), serviceAccess != null ? serviceAccess : "");
    if (!masterSecurityGroup.isEmpty()) {
        jobConfig.withEmrManagedMasterSecurityGroup(masterSecurityGroup);
    }
    if (!slaveSecurityGroup.isEmpty()) {
        jobConfig.withEmrManagedSlaveSecurityGroup(slaveSecurityGroup);
    }
    if (!serviceAccessSecurityGroup.isEmpty()) {
        jobConfig.withServiceAccessSecurityGroup(serviceAccessSecurityGroup);
    }
    final String slaveType = emrProperties.getSlaveType();
    if (count > 1) {
        jobConfig.withSlaveInstanceType(Objects.toString(this.clusterProperties.getSlaveInstanceType(), slaveType));
    }
    this.addTagsToEMRCluster();
    final Map<String, String> sparkProperties = new HashMap<>();
    sparkProperties.put("maximizeResourceAllocation", "true");
    final String emrReleaseVersion = emrProperties.getEmrRelease();
    final String serviceRole = emrProperties.getServiceRole();
    final String jobFlowRole = emrProperties.getJobFlowRole();
    final String emrSecurityConfiguration = Objects.toString(clusterProperties.getEmr_security_configuration(), "");
    final Map<String, String> emrfsProperties = new HashMap<>();
    emrfsProperties.put("fs.s3.canned.acl", "BucketOwnerFullControl");
    final Configuration myEmrfsConfig = new Configuration().withClassification("emrfs-site").withProperties(emrfsProperties);
    Map<String, String> sparkHiveProperties = emrProperties.getSparkHiveProperties().entrySet().stream().filter(keyVal -> !keyVal.getValue().isEmpty()).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    sparkHiveProperties.putAll(this.clusterProperties.getSparkHiveProperties());
    Map<String, String> hiveProperties = emrProperties.getHiveProperties().entrySet().stream().filter(keyVal -> !keyVal.getValue().isEmpty()).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    hiveProperties.putAll(this.clusterProperties.getHiveProperties());
    Configuration sparkHiveConfig = new Configuration().withClassification("spark-hive-site").withProperties(sparkHiveProperties);
    Configuration hiveConfig = new Configuration().withClassification("hive-site").withProperties(hiveProperties);
    final RunJobFlowRequest request = new RunJobFlowRequest()
            .withName(this.taskId)
            .withReleaseLabel(Objects.toString(this.clusterProperties.getEmrReleaseVersion(), emrReleaseVersion))
            .withSteps(customExampleStep)
            .withApplications(spark, hive)
            .withLogUri(logPath)
            .withServiceRole(Objects.toString(this.clusterProperties.getEmrServiceRole(), serviceRole))
            .withJobFlowRole(Objects.toString(this.clusterProperties.getInstanceProfile(), jobFlowRole))
            .withVisibleToAllUsers(true)
            .withTags(this.emrTags.values())
            .withConfigurations(new Configuration().withClassification("spark").withProperties(sparkProperties), myEmrfsConfig)
            .withInstances(jobConfig);
    if (!hiveProperties.isEmpty()) {
        request.withConfigurations(hiveConfig);
    }
    if (!sparkHiveProperties.isEmpty()) {
        request.withConfigurations(sparkHiveConfig);
    }
    if (!emrSecurityConfiguration.isEmpty()) {
        request.withSecurityConfiguration(emrSecurityConfiguration);
    }
    if (haveBootstrapAction) {
        final BootstrapActionConfig bsConfig = new BootstrapActionConfig();
        bsConfig.setName("bootstrapaction");
        bsConfig.setScriptBootstrapAction(new ScriptBootstrapActionConfig().withPath("s3://" + this.jksS3Path));
        request.withBootstrapActions(bsConfig);
    }
    return emr.runJobFlow(request);
}
Also used: AmazonElasticMapReduce(com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduce) Slf4j(lombok.extern.slf4j.Slf4j) java.util(java.util) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) DataPullClientConfig(com.homeaway.datapullclient.config.DataPullClientConfig) Autowired(org.springframework.beans.factory.annotation.Autowired) DataPullProperties(com.homeaway.datapullclient.config.DataPullProperties) ClusterProperties(com.homeaway.datapullclient.input.ClusterProperties) com.amazonaws.services.elasticmapreduce.model(com.amazonaws.services.elasticmapreduce.model) Collectors(java.util.stream.Collectors) EMRProperties(com.homeaway.datapullclient.config.EMRProperties)
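
runTaskInNewCluster receives an already-built AmazonElasticMapReduce client. A minimal sketch of the surrounding call, assuming a standard AWS SDK v1 client built with AmazonElasticMapReduceClientBuilder and the class's Slf4j logger; the region, S3 paths, and parameter values are placeholders, and since the method is private this only illustrates the shape of an invocation from within DataPullTask:

// Hypothetical invocation sketch; region and S3 paths are placeholders.
final AmazonElasticMapReduce emr = AmazonElasticMapReduceClientBuilder.standard()
        .withRegion("us-east-1")
        .build();
final RunJobFlowResult result = runTaskInNewCluster(
        emr,
        "s3://my-log-bucket/emr-logs/",     // logPath
        "s3://my-code-bucket/datapull.jar", // jarPath of the Spark job
        null,                               // no custom spark-submit params: take the default branch
        false);                             // no bootstrap action
log.info("Started EMR job flow {}", result.getJobFlowId());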

Example 4 with EMRProperties

Use of com.homeaway.datapullclient.config.EMRProperties in project datapull by homeaway.

The class DataPullTask, method toString:

@Override
public String toString() {
    final DataPullProperties dataPullProperties = this.config.getDataPullProperties();
    final EMRProperties emrProperties = this.config.getEmrProperties();
    return "DataPullTask{" + "taskId='" + this.taskId + '\'' + ", jsonS3Path='" + this.jsonS3Path + '\'' + ", logFilePath='" + dataPullProperties.getLogFilePath() + '\'' + ", s3RepositoryBucketName='" + dataPullProperties.getS3BucketName() + '\'' + ", s3JarPath='" + this.s3JarPath + '\'' + ", instanceCount=" + emrProperties.getInstanceCount() + ", masterType='" + emrProperties.getMasterType() + '\'' + ", slaveType='" + emrProperties.getSlaveType() + '\'' + ", serviceRole='" + emrProperties.getServiceRole() + '\'' + ", jobFlowRole='" + emrProperties.getJobFlowRole() + '\'' + ", emrReleaseVersion='" + emrProperties.getEmrRelease() + '\'' + ", config=" + this.config + '}';
}
Also used: DataPullProperties(com.homeaway.datapullclient.config.DataPullProperties) EMRProperties(com.homeaway.datapullclient.config.EMRProperties)
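
As a side note, a hand-rolled toString like the one above is often generated instead; since the project already uses Lombok (@Slf4j), a hypothetical equivalent with @ToString might look like the sketch below. The class shape is assumed for illustration and is not the actual DataPullTask:

import lombok.ToString;

// Hypothetical sketch: Lombok generates toString over the listed fields.
@ToString(of = {"taskId", "jsonS3Path", "s3JarPath"})
public class DataPullTaskSketch {
    private String taskId;
    private String jsonS3Path;
    private String s3JarPath;
}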

Aggregations

EMRProperties (com.homeaway.datapullclient.config.EMRProperties): 4
DataPullProperties (com.homeaway.datapullclient.config.DataPullProperties): 3
DataPullClientConfig (com.homeaway.datapullclient.config.DataPullClientConfig): 2
ClusterProperties (com.homeaway.datapullclient.input.ClusterProperties): 2
java.util (java.util): 2
Collectors (java.util.stream.Collectors): 2
Slf4j (lombok.extern.slf4j.Slf4j): 2
Autowired (org.springframework.beans.factory.annotation.Autowired): 2
AmazonElasticMapReduce (com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduce): 1
com.amazonaws.services.elasticmapreduce.model (com.amazonaws.services.elasticmapreduce.model): 1
AmazonS3 (com.amazonaws.services.s3.AmazonS3): 1
com.amazonaws.services.s3.model (com.amazonaws.services.s3.model): 1
JsonNode (com.fasterxml.jackson.databind.JsonNode): 1
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 1
ObjectNode (com.fasterxml.jackson.databind.node.ObjectNode): 1
DataPullContext (com.homeaway.datapullclient.config.DataPullContext): 1
DataPullContextHolder (com.homeaway.datapullclient.config.DataPullContextHolder): 1
InvalidPointedJsonException (com.homeaway.datapullclient.exception.InvalidPointedJsonException): 1
ProcessingException (com.homeaway.datapullclient.exception.ProcessingException): 1
JsonInputFile (com.homeaway.datapullclient.input.JsonInputFile): 1