Example 1 with DataPullProperties

use of com.homeaway.datapullclient.config.DataPullProperties in project datapull by homeaway.

In the class DataPullRequestProcessor, the method runDataPull:

private void runDataPull(String json, boolean isStart, boolean validateJson) throws ProcessingException {
    String originalInputJson = json;
    json = extractUserJsonFromS3IfProvided(json, isStart);
    final EMRProperties emrProperties = this.config.getEmrProperties();
    if (log.isDebugEnabled())
        log.debug("runDataPull -> json = " + json + " isStart = " + isStart);
    try {
        if (validateJson) {
            json = validateAndEnrich(json);
        }
        log.info("Running datapull for json : " + json + " cron expression = " + isStart + "env =" + env);
        final ObjectNode node = new ObjectMapper().readValue(json, ObjectNode.class);
        List<Map.Entry<String, JsonNode>> result = new LinkedList<Map.Entry<String, JsonNode>>();
        Iterator<Map.Entry<String, JsonNode>> nodes = node.fields();
        while (nodes.hasNext()) {
            result.add(nodes.next());
        }
        JsonNode clusterNode = result.stream().filter(y -> y.getKey().equalsIgnoreCase("cluster")).map(x -> x.getValue()).findAny().orElse(null); // orElse(null) so the null guard below is reachable; findAny().get() would throw first
        JsonNode migrationsNode = result.stream().filter(y -> y.getKey().equalsIgnoreCase("migrations")).map(x -> x.getValue()).findAny().get();
        if (clusterNode == null)
            throw new ProcessingException("Invalid Json!!! Cluster properties cannot be null");
        String creator = node.has(CREATOR) ? node.findValue(CREATOR).asText() : "";
        ObjectMapper mapper = new ObjectMapper();
        ClusterProperties reader = mapper.treeToValue(clusterNode, ClusterProperties.class);
        Migration[] myObjects = mapper.treeToValue(migrationsNode, Migration[].class);
        String cronExp = Objects.toString(reader.getCronExpression(), "");
        if (!cronExp.isEmpty())
            cronExp = validateAndProcessCronExpression(cronExp);
        String pipeline = Objects.toString(reader.getPipelineName(), UUID.randomUUID().toString());
        String pipelineEnv = Objects.toString(reader.getAwsEnv(), env);
        DataPullProperties dataPullProperties = config.getDataPullProperties();
        String applicationHistoryFolder = dataPullProperties.getApplicationHistoryFolder();
        String s3RepositoryBucketName = dataPullProperties.getS3BucketName();
        String jobName = pipelineEnv + PIPELINE_NAME_DELIMITER + EMR + PIPELINE_NAME_DELIMITER + pipeline + PIPELINE_NAME_DELIMITER + PIPELINE_NAME_SUFFIX;
        String applicationHistoryFolderPath = applicationHistoryFolder == null || applicationHistoryFolder.isEmpty() ? s3RepositoryBucketName + "/" + DATAPULL_HISTORY_FOLDER : applicationHistoryFolder;
        String bootstrapFilePath = s3RepositoryBucketName + "/" + BOOTSTRAP_FOLDER;
        String filePath = applicationHistoryFolderPath + "/" + jobName;
        String bootstrapFile = jobName + ".sh";
        String jksFilePath = bootstrapFilePath + "/" + bootstrapFile;
        String bootstrapActionStringFromUser = Objects.toString(reader.getBootstrapactionstring(), "");
        String defaultBootstrapString = emrProperties.getDefaultBootstrapString();
        Boolean haveBootstrapAction = createBootstrapScript(myObjects, bootstrapFile, bootstrapFilePath, bootstrapActionStringFromUser, defaultBootstrapString);
        DataPullTask task = createDataPullTask(filePath, jksFilePath, reader, jobName, creator, node.path("sparkjarfile").asText(), haveBootstrapAction);
        if (!isStart) {
            json = originalInputJson.equals(json) ? json : originalInputJson;
            saveConfig(applicationHistoryFolderPath, jobName + ".json", json);
        }
        if (!isStart && tasksMap.containsKey(jobName))
            cancelExistingTask(jobName);
        if (!(isStart && cronExp.isEmpty())) {
            Future<?> future = !cronExp.isEmpty() ? scheduler.schedule(task, new CronTrigger(cronExp)) : scheduler.schedule(task, new Date(System.currentTimeMillis() + 1 * 1000));
            tasksMap.put(jobName, future);
        }
    } catch (IOException e) {
        throw new ProcessingException("exception while starting datapull " + e.getLocalizedMessage());
    }
    if (log.isDebugEnabled())
        log.debug("runDataPull <- return");
}
Also used : java.util(java.util) DataPullClientService(com.homeaway.datapullclient.service.DataPullClientService) DataPullContextHolder(com.homeaway.datapullclient.config.DataPullContextHolder) Autowired(org.springframework.beans.factory.annotation.Autowired) DataPullProperties(com.homeaway.datapullclient.config.DataPullProperties) ProcessingException(com.homeaway.datapullclient.exception.ProcessingException) ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) EMRProperties(com.homeaway.datapullclient.config.EMRProperties) Value(org.springframework.beans.factory.annotation.Value) PathMatchingResourcePatternResolver(org.springframework.core.io.support.PathMatchingResourcePatternResolver) Future(java.util.concurrent.Future) JSONObject(org.json.JSONObject) ByteArrayInputStream(java.io.ByteArrayInputStream) JsonInputFile(com.homeaway.datapullclient.input.JsonInputFile) Service(org.springframework.stereotype.Service) AmazonS3(com.amazonaws.services.s3.AmazonS3) JsonNode(com.fasterxml.jackson.databind.JsonNode) ThreadPoolTaskScheduler(org.springframework.scheduling.concurrent.ThreadPoolTaskScheduler) SchemaLoader(org.everit.json.schema.loader.SchemaLoader) Resource(org.springframework.core.io.Resource) Migration(com.homeaway.datapullclient.input.Migration) ValidationException(org.everit.json.schema.ValidationException) Source(com.homeaway.datapullclient.input.Source) DataPullContext(com.homeaway.datapullclient.config.DataPullContext) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) JSONTokener(org.json.JSONTokener) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) IOException(java.io.IOException) com.amazonaws.services.s3.model(com.amazonaws.services.s3.model) ClusterProperties(com.homeaway.datapullclient.input.ClusterProperties) InputStreamReader(java.io.InputStreamReader) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) CronTrigger(org.springframework.scheduling.support.CronTrigger) InvalidPointedJsonException(com.homeaway.datapullclient.exception.InvalidPointedJsonException) Slf4j(lombok.extern.slf4j.Slf4j) PostConstruct(javax.annotation.PostConstruct) DataPullClientConfig(com.homeaway.datapullclient.config.DataPullClientConfig) Schema(org.everit.json.schema.Schema) ResourcePatternResolver(org.springframework.core.io.support.ResourcePatternResolver) BufferedReader(java.io.BufferedReader)
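
The interesting piece at the end of runDataPull is the scheduling branch: a non-empty cron expression registers the task as a recurring CronTrigger job, otherwise it is fired once about a second later, and the returned Future is kept in tasksMap so it can be cancelled. Below is a minimal, self-contained sketch of that pattern with Spring's ThreadPoolTaskScheduler; the Runnable, pool size and cron value are placeholders, not values from the project.

import java.util.Date;
import java.util.concurrent.Future;

import org.springframework.scheduling.concurrent.ThreadPoolTaskScheduler;
import org.springframework.scheduling.support.CronTrigger;

public class ScheduleSketch {

    public static void main(String[] args) {
        ThreadPoolTaskScheduler scheduler = new ThreadPoolTaskScheduler();
        scheduler.setPoolSize(2);
        scheduler.initialize();

        // Placeholder task; the real code schedules a DataPullTask
        Runnable task = () -> System.out.println("datapull task fired at " + new Date());

        // Placeholder cron expression; in runDataPull it comes from ClusterProperties
        String cronExp = "0 0/5 * * * *";

        // Same branching as runDataPull: recurring when a cron expression exists, one-shot otherwise
        Future<?> future = !cronExp.isEmpty()
                ? scheduler.schedule(task, new CronTrigger(cronExp))
                : scheduler.schedule(task, new Date(System.currentTimeMillis() + 1000));

        // The processor keeps this Future in tasksMap so cancelExistingTask can cancel it later
        System.out.println("scheduled, cancelled so far: " + future.isCancelled());
    }
}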

Example 2 with DataPullProperties

use of com.homeaway.datapullclient.config.DataPullProperties in project datapull by homeaway.

In the class DataPullTask, the method runTaskInNewCluster:

private RunJobFlowResult runTaskInNewCluster(final AmazonElasticMapReduce emr, final String logPath, final String jarPath, final String sparkSubmitParams, final Boolean haveBootstrapAction) {
    HadoopJarStepConfig runExampleConfig = null;
    if (sparkSubmitParams != null && !sparkSubmitParams.isEmpty()) {
        final List<String> sparkSubmitParamsList = this.prepareSparkSubmitParams(sparkSubmitParams);
        runExampleConfig = new HadoopJarStepConfig().withJar("command-runner.jar").withArgs(sparkSubmitParamsList);
    } else {
        runExampleConfig = new HadoopJarStepConfig().withJar("command-runner.jar").withArgs("spark-submit", "--conf", "spark.default.parallelism=3", "--conf", "spark.storage.blockManagerSlaveTimeoutMs=1200s", "--conf", "spark.executor.heartbeatInterval=900s", "--conf", "spark.driver.extraJavaOptions=-Djavax.net.ssl.trustStore=/etc/pki/java/cacerts/ -Djavax.net.ssl.trustStorePassword=changeit", "--conf", "spark.executor.extraJavaOptions=-Djavax.net.ssl.trustStore=/etc/pki/java/cacerts/ -Djavax.net.ssl.trustStorePassword=changeit", "--packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4,org.apache.spark:spark-avro_2.11:2.4.4", "--class", DataPullTask.MAIN_CLASS, jarPath, String.format(DataPullTask.JSON_WITH_INPUT_FILE_PATH, this.jsonS3Path));
    }
    final StepConfig customExampleStep = new StepConfig().withName(this.taskId).withActionOnFailure("CONTINUE").withHadoopJarStep(runExampleConfig);
    final Application spark = new Application().withName("Spark");
    final Application hive = new Application().withName("Hive");
    final EMRProperties emrProperties = this.config.getEmrProperties();
    final int instanceCount = emrProperties.getInstanceCount();
    final String masterType = emrProperties.getMasterType();
    final DataPullProperties datapullProperties = this.config.getDataPullProperties();
    final String applicationSubnet = Objects.toString(this.clusterProperties.getSubnetId(), datapullProperties.getApplicationSubnet1());
    final int count = Integer.valueOf(Objects.toString(this.clusterProperties.getEmrInstanceCount(), Integer.toString(instanceCount)));
    final JobFlowInstancesConfig jobConfig = new JobFlowInstancesConfig()
            // passing an invalid key will make the process terminate
            .withEc2KeyName(Objects.toString(this.clusterProperties.getEc2KeyName(), emrProperties.getEc2KeyName()))
            .withEc2SubnetId(applicationSubnet)
            .withMasterInstanceType(Objects.toString(this.clusterProperties.getMasterInstanceType(), masterType))
            .withInstanceCount(count)
            .withKeepJobFlowAliveWhenNoSteps(!Boolean.valueOf(Objects.toString(this.clusterProperties.getTerminateClusterAfterExecution(), "true")));
    final String masterSG = emrProperties.getEmrSecurityGroupMaster();
    final String slaveSG = emrProperties.getEmrSecurityGroupSlave();
    final String serviceAccess = emrProperties.getEmrSecurityGroupServiceAccess();
    final String masterSecurityGroup = Objects.toString(this.clusterProperties.getMasterSecurityGroup(), masterSG != null ? masterSG : "");
    final String slaveSecurityGroup = Objects.toString(this.clusterProperties.getSlaveSecurityGroup(), slaveSG != null ? slaveSG : "");
    final String serviceAccessSecurityGroup = Objects.toString(this.clusterProperties.getServiceAccessSecurityGroup(), serviceAccess != null ? serviceAccess : "");
    if (!masterSecurityGroup.isEmpty()) {
        jobConfig.withEmrManagedMasterSecurityGroup(masterSecurityGroup);
    }
    if (!slaveSecurityGroup.isEmpty()) {
        jobConfig.withEmrManagedSlaveSecurityGroup(slaveSecurityGroup);
    }
    if (!serviceAccessSecurityGroup.isEmpty()) {
        jobConfig.withServiceAccessSecurityGroup(serviceAccessSecurityGroup);
    }
    final String slaveType = emrProperties.getSlaveType();
    if (count > 1) {
        jobConfig.withSlaveInstanceType(Objects.toString(this.clusterProperties.getSlaveInstanceType(), slaveType));
    }
    this.addTagsToEMRCluster();
    final Map<String, String> sparkProperties = new HashMap<>();
    sparkProperties.put("maximizeResourceAllocation", "true");
    final String emrReleaseVersion = emrProperties.getEmrRelease();
    final String serviceRole = emrProperties.getServiceRole();
    final String jobFlowRole = emrProperties.getJobFlowRole();
    final String emrSecurityConfiguration = Objects.toString(clusterProperties.getEmr_security_configuration(), "");
    Map<String, String> emrfsProperties = new HashMap<String, String>();
    emrfsProperties.put("fs.s3.canned.acl", "BucketOwnerFullControl");
    Configuration myEmrfsConfig = new Configuration().withClassification("emrfs-site").withProperties(emrfsProperties);
    Map<String, String> sparkHiveProperties = emrProperties.getSparkHiveProperties().entrySet().stream().filter(keyVal -> !keyVal.getValue().isEmpty()).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    sparkHiveProperties.putAll(this.clusterProperties.getSparkHiveProperties());
    Map<String, String> hiveProperties = emrProperties.getHiveProperties().entrySet().stream().filter(keyVal -> !keyVal.getValue().isEmpty()).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    hiveProperties.putAll(this.clusterProperties.getHiveProperties());
    Configuration sparkHiveConfig = new Configuration().withClassification("spark-hive-site").withProperties(sparkHiveProperties);
    Configuration hiveConfig = new Configuration().withClassification("hive-site").withProperties(hiveProperties);
    final RunJobFlowRequest request = new RunJobFlowRequest()
            .withName(this.taskId)
            .withReleaseLabel(Objects.toString(this.clusterProperties.getEmrReleaseVersion(), emrReleaseVersion))
            .withSteps(customExampleStep)
            .withApplications(spark, hive)
            .withLogUri(logPath)
            .withServiceRole(Objects.toString(this.clusterProperties.getEmrServiceRole(), serviceRole))
            // addAdditionalInfoEntry("maximizeResourceAllocation", "true")
            .withJobFlowRole(Objects.toString(this.clusterProperties.getInstanceProfile(), jobFlowRole))
            .withVisibleToAllUsers(true)
            .withTags(this.emrTags.values())
            .withConfigurations(new Configuration().withClassification("spark").withProperties(sparkProperties), myEmrfsConfig)
            .withInstances(jobConfig);
    if (!hiveProperties.isEmpty()) {
        request.withConfigurations(hiveConfig);
    }
    if (!sparkHiveProperties.isEmpty()) {
        request.withConfigurations(sparkHiveConfig);
    }
    if (!emrSecurityConfiguration.isEmpty()) {
        request.withSecurityConfiguration(emrSecurityConfiguration);
    }
    if (haveBootstrapAction) {
        final BootstrapActionConfig bsConfig = new BootstrapActionConfig();
        bsConfig.setName("bootstrapaction");
        bsConfig.setScriptBootstrapAction(new ScriptBootstrapActionConfig().withPath("s3://" + this.jksS3Path));
        request.withBootstrapActions(bsConfig);
    }
    return emr.runJobFlow(request);
}
Also used : AmazonElasticMapReduce(com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduce) Slf4j(lombok.extern.slf4j.Slf4j) java.util(java.util) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) DataPullClientConfig(com.homeaway.datapullclient.config.DataPullClientConfig) Autowired(org.springframework.beans.factory.annotation.Autowired) DataPullProperties(com.homeaway.datapullclient.config.DataPullProperties) ClusterProperties(com.homeaway.datapullclient.input.ClusterProperties) com.amazonaws.services.elasticmapreduce.model(com.amazonaws.services.elasticmapreduce.model) Collectors(java.util.stream.Collectors) EMRProperties(com.homeaway.datapullclient.config.EMRProperties)
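
To see the moving parts of runTaskInNewCluster in isolation, the sketch below launches a single-node cluster with one command-runner.jar spark-submit step and the same emrfs-site classification. The bucket, jar path, roles, release label and instance type are placeholders, not values taken from the project.

import java.util.HashMap;
import java.util.Map;

import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduce;
import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduceClientBuilder;
import com.amazonaws.services.elasticmapreduce.model.Application;
import com.amazonaws.services.elasticmapreduce.model.Configuration;
import com.amazonaws.services.elasticmapreduce.model.HadoopJarStepConfig;
import com.amazonaws.services.elasticmapreduce.model.JobFlowInstancesConfig;
import com.amazonaws.services.elasticmapreduce.model.RunJobFlowRequest;
import com.amazonaws.services.elasticmapreduce.model.RunJobFlowResult;
import com.amazonaws.services.elasticmapreduce.model.StepConfig;

public class EmrLaunchSketch {

    public static void main(String[] args) {
        AmazonElasticMapReduce emr = AmazonElasticMapReduceClientBuilder.defaultClient();

        // spark-submit runs through command-runner.jar, as in runTaskInNewCluster; class and jar are placeholders
        HadoopJarStepConfig sparkStep = new HadoopJarStepConfig()
                .withJar("command-runner.jar")
                .withArgs("spark-submit", "--class", "com.example.Main", "s3://my-bucket/app.jar");

        StepConfig step = new StepConfig()
                .withName("datapull-step")
                .withActionOnFailure("CONTINUE")
                .withHadoopJarStep(sparkStep);

        // emrfs-site classification used by the task to set the canned ACL
        Map<String, String> emrfsProps = new HashMap<>();
        emrfsProps.put("fs.s3.canned.acl", "BucketOwnerFullControl");
        Configuration emrfsConfig = new Configuration()
                .withClassification("emrfs-site")
                .withProperties(emrfsProps);

        RunJobFlowRequest request = new RunJobFlowRequest()
                .withName("example-emr-pipeline")                 // placeholder name
                .withReleaseLabel("emr-5.29.0")                   // placeholder release label
                .withApplications(new Application().withName("Spark"))
                .withSteps(step)
                .withConfigurations(emrfsConfig)
                .withServiceRole("EMR_DefaultRole")               // placeholder roles
                .withJobFlowRole("EMR_EC2_DefaultRole")
                .withLogUri("s3://my-bucket/logs/")
                .withVisibleToAllUsers(true)
                .withInstances(new JobFlowInstancesConfig()
                        .withInstanceCount(1)
                        .withMasterInstanceType("m5.xlarge")
                        .withKeepJobFlowAliveWhenNoSteps(false));

        RunJobFlowResult result = emr.runJobFlow(request);
        System.out.println("Started job flow " + result.getJobFlowId());
    }
}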

Example 3 with DataPullProperties

use of com.homeaway.datapullclient.config.DataPullProperties in project datapull by homeaway.

In the class DataPullRequestProcessor, the method getPendingTaskNames:

private List<String> getPendingTaskNames() {
    if (log.isDebugEnabled())
        log.debug("getPendingTaskNames -> ");
    AmazonS3 s3Client = config.getS3Client();
    DataPullProperties dataPullProperties = config.getDataPullProperties();
    String applicationHistoryFolder = dataPullProperties.getApplicationHistoryFolder();
    String s3RepositoryBucketName = dataPullProperties.getS3BucketName();
    String applicationHistoryFolderPath = applicationHistoryFolder == null || applicationHistoryFolder.equals("") ? s3RepositoryBucketName + "/" + DATAPULL_HISTORY_FOLDER : applicationHistoryFolder;
    String historyFolderPrefix = applicationHistoryFolderPath.substring(applicationHistoryFolderPath.indexOf("/") + 1);
    ListObjectsRequest listObjectsRequest = new ListObjectsRequest().withBucketName(s3RepositoryBucketName).withPrefix(historyFolderPrefix + "/").withDelimiter("/");
    ObjectListing objectListing = s3Client.listObjects(listObjectsRequest);
    // ObjectListing objectListing = s3Client.listObjects(s3RepositoryBucketName);
    List<String> fileNames = new ArrayList<>();
    while (true) {
        List<String> fn = objectListing.getObjectSummaries().stream().filter(x -> !x.getKey().isEmpty() && x.getKey().endsWith(".json")).map(x -> x.getKey().substring(x.getKey().indexOf("/") + 1)).collect(Collectors.toList());
        fileNames.addAll(fn);
        if (objectListing.isTruncated()) {
            objectListing = s3Client.listNextBatchOfObjects(objectListing);
        } else {
            break;
        }
    }
    log.info("pending task names  = " + fileNames);
    if (log.isDebugEnabled())
        log.debug("getPendingTaskNames <- return " + fileNames);
    return fileNames;
}
Also used : java.util(java.util) DataPullClientService(com.homeaway.datapullclient.service.DataPullClientService) DataPullContextHolder(com.homeaway.datapullclient.config.DataPullContextHolder) Autowired(org.springframework.beans.factory.annotation.Autowired) DataPullProperties(com.homeaway.datapullclient.config.DataPullProperties) ProcessingException(com.homeaway.datapullclient.exception.ProcessingException) ObjectNode(com.fasterxml.jackson.databind.node.ObjectNode) EMRProperties(com.homeaway.datapullclient.config.EMRProperties) Value(org.springframework.beans.factory.annotation.Value) PathMatchingResourcePatternResolver(org.springframework.core.io.support.PathMatchingResourcePatternResolver) Future(java.util.concurrent.Future) JSONObject(org.json.JSONObject) ByteArrayInputStream(java.io.ByteArrayInputStream) JsonInputFile(com.homeaway.datapullclient.input.JsonInputFile) Service(org.springframework.stereotype.Service) AmazonS3(com.amazonaws.services.s3.AmazonS3) JsonNode(com.fasterxml.jackson.databind.JsonNode) ThreadPoolTaskScheduler(org.springframework.scheduling.concurrent.ThreadPoolTaskScheduler) SchemaLoader(org.everit.json.schema.loader.SchemaLoader) Resource(org.springframework.core.io.Resource) Migration(com.homeaway.datapullclient.input.Migration) ValidationException(org.everit.json.schema.ValidationException) Source(com.homeaway.datapullclient.input.Source) DataPullContext(com.homeaway.datapullclient.config.DataPullContext) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) JSONTokener(org.json.JSONTokener) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) IOException(java.io.IOException) com.amazonaws.services.s3.model(com.amazonaws.services.s3.model) ClusterProperties(com.homeaway.datapullclient.input.ClusterProperties) InputStreamReader(java.io.InputStreamReader) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) CronTrigger(org.springframework.scheduling.support.CronTrigger) InvalidPointedJsonException(com.homeaway.datapullclient.exception.InvalidPointedJsonException) Slf4j(lombok.extern.slf4j.Slf4j) PostConstruct(javax.annotation.PostConstruct) DataPullClientConfig(com.homeaway.datapullclient.config.DataPullClientConfig) Schema(org.everit.json.schema.Schema) ResourcePatternResolver(org.springframework.core.io.support.ResourcePatternResolver) BufferedReader(java.io.BufferedReader)
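
getPendingTaskNames pages through the history folder with listObjects, isTruncated and listNextBatchOfObjects. The same loop can also be written against the ListObjectsV2 API with continuation tokens; here is a small sketch under an assumed bucket and prefix (both placeholders).

import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.ListObjectsV2Request;
import com.amazonaws.services.s3.model.ListObjectsV2Result;

public class PendingTasksV2Sketch {

    public static void main(String[] args) {
        AmazonS3 s3 = AmazonS3ClientBuilder.defaultClient();

        String bucket = "my-datapull-bucket";            // placeholder bucket
        String prefix = "datapull-opensource/history/";  // placeholder history prefix

        List<String> fileNames = new ArrayList<>();
        ListObjectsV2Request request = new ListObjectsV2Request()
                .withBucketName(bucket)
                .withPrefix(prefix)
                .withDelimiter("/");

        ListObjectsV2Result result;
        do {
            result = s3.listObjectsV2(request);
            fileNames.addAll(result.getObjectSummaries().stream()
                    .filter(s -> s.getKey().endsWith(".json"))
                    .map(s -> s.getKey().substring(s.getKey().indexOf('/') + 1))
                    .collect(Collectors.toList()));
            // V2 paging: carry the continuation token forward instead of calling listNextBatchOfObjects
            request.setContinuationToken(result.getNextContinuationToken());
        } while (result.isTruncated());

        System.out.println("pending task names = " + fileNames);
    }
}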

Example 4 with DataPullProperties

use of com.homeaway.datapullclient.config.DataPullProperties in project datapull by homeaway.

In the class DataPullRequestProcessor, the method readAndExcecuteInputJson:

private String readAndExcecuteInputJson(String fileName) throws ProcessingException {
    if (log.isDebugEnabled())
        log.debug("readAndExcecuteInputJson -> fileName=" + fileName);
    DataPullProperties dataPullProperties = config.getDataPullProperties();
    String applicationHistoryFolder = dataPullProperties.getApplicationHistoryFolder();
    String s3RepositoryBucketName = dataPullProperties.getS3BucketName();
    AmazonS3 s3Client = config.getS3Client();
    String applicationHistoryFolderPath = applicationHistoryFolder == null || applicationHistoryFolder.equals("") ? s3RepositoryBucketName + "/" + DATAPULL_HISTORY_FOLDER : applicationHistoryFolder;
    String result = readFileFromS3(s3Client, applicationHistoryFolderPath, fileName.substring(fileName.indexOf("/") + 1));
    runDataPull(result, true, false);
    if (log.isDebugEnabled())
        log.debug("readAndExcecuteInputJson <- return=" + result);
    return result;
}
Also used : AmazonS3(com.amazonaws.services.s3.AmazonS3) DataPullProperties(com.homeaway.datapullclient.config.DataPullProperties)
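
readAndExcecuteInputJson delegates the download itself to readFileFromS3, which is not part of this excerpt. A plausible shape for such a helper, using the standard getObject call and reading the stream into a String, is sketched below; the bucket/key splitting and error handling are assumptions, not the project's implementation.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.stream.Collectors;

import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.S3Object;

public class S3ReadSketch {

    // Hypothetical stand-in for the processor's readFileFromS3 helper
    static String readFileFromS3(AmazonS3 s3Client, String folderPath, String fileName) throws IOException {
        // folderPath is "<bucket>/<prefix...>", matching how the caller builds applicationHistoryFolderPath
        String bucket = folderPath.substring(0, folderPath.indexOf('/'));
        String key = folderPath.substring(folderPath.indexOf('/') + 1) + "/" + fileName;

        try (S3Object object = s3Client.getObject(bucket, key);
             BufferedReader reader = new BufferedReader(
                     new InputStreamReader(object.getObjectContent(), StandardCharsets.UTF_8))) {
            return reader.lines().collect(Collectors.joining("\n"));
        }
    }
}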

Example 5 with DataPullProperties

use of com.homeaway.datapullclient.config.DataPullProperties in project datapull by homeaway.

In the class DataPullTask, the method runSparkCluster:

private void runSparkCluster() {
    DataPullTask.log.info("Started cluster config taskId = " + this.taskId);
    final AmazonElasticMapReduce emr = this.config.getEMRClient();
    final int MAX_RETRY = 16;
    final DataPullProperties dataPullProperties = this.config.getDataPullProperties();
    final String logFilePath = dataPullProperties.getLogFilePath();
    final String s3RepositoryBucketName = dataPullProperties.getS3BucketName();
    final String logPath = logFilePath == null || logFilePath.equals("") ? "s3://" + s3RepositoryBucketName + "/" + "datapull-opensource/logs/SparkLogs" : logFilePath;
    s3JarPath = s3JarPath == null || s3JarPath.equals("") ? "s3://" + s3RepositoryBucketName + "/" + "datapull-opensource/jars/DataMigrationFramework-1.0-SNAPSHOT-jar-with-dependencies.jar" : s3JarPath;
    List<ClusterSummary> clusters = new ArrayList<>();
    ListClustersRequest listClustersRequest = new ListClustersRequest();
    // Only get clusters that are in usable state
    listClustersRequest.setClusterStates(Arrays.asList(ClusterState.RUNNING.toString(), ClusterState.WAITING.toString(), ClusterState.STARTING.toString()));
    ListClustersResult listClustersResult = retryListClusters(emr, MAX_RETRY, listClustersRequest);
    while (true) {
        for (ClusterSummary cluster : listClustersResult.getClusters()) {
            if (cluster.getName().toLowerCase().equals(this.taskId.toLowerCase())) {
                clusters.add(cluster);
            }
        }
        if (listClustersResult.getMarker() != null) {
            listClustersRequest.setMarker(listClustersResult.getMarker());
            listClustersResult = retryListClusters(emr, MAX_RETRY, listClustersRequest);
        } else {
            break;
        }
    }
    // find all datapull EMR clusters to be reaped
    List<ClusterSummary> reapClusters = new ArrayList<>();
    Calendar cal = Calendar.getInstance();
    cal.add(Calendar.DATE, -2);
    listClustersRequest.setClusterStates(Arrays.asList(ClusterState.WAITING.toString()));
    listClustersRequest.setCreatedBefore(cal.getTime());
    listClustersResult = retryListClusters(emr, MAX_RETRY, listClustersRequest);
    while (true) {
        for (ClusterSummary cluster : listClustersResult.getClusters()) {
            if (cluster.getName().matches(".*-emr-.*-pipeline")) {
                ListStepsRequest listSteps = new ListStepsRequest().withClusterId(cluster.getId());
                ListStepsResult steps = retryListSteps(emr, MAX_RETRY, listSteps);
                Date maxStepEndTime = new Date(0);
                while (true) {
                    for (StepSummary step : steps.getSteps()) {
                        Date stepEndDate = step.getStatus().getTimeline().getEndDateTime();
                        if (stepEndDate != null && stepEndDate.after(maxStepEndTime)) {
                            maxStepEndTime = stepEndDate;
                        }
                    }
                    if (steps.getMarker() != null) {
                        listSteps.setMarker(steps.getMarker());
                        steps = retryListSteps(emr, MAX_RETRY, listSteps);
                    } else {
                        break;
                    }
                }
                if (maxStepEndTime.before(cal.getTime())) {
                    reapClusters.add(cluster);
                }
            }
        }
        if (listClustersResult.getMarker() != null) {
            listClustersRequest.setMarker(listClustersResult.getMarker());
            listClustersResult = retryListClusters(emr, MAX_RETRY, listClustersRequest);
        } else {
            break;
        }
    }
    DataPullTask.log.info("Number of useable clusters for taskId " + this.taskId + " = " + clusters.size());
    DataPullTask.log.info("Number of reapable clusters = " + reapClusters.size());
    if (!clusters.isEmpty()) {
        final ClusterSummary summary = clusters.get(0);
        final boolean forceRestart = Boolean.TRUE.equals(clusterProperties.getForceRestart()); // null-safe: a missing flag means no forced restart
        if (summary != null && !forceRestart) {
            this.runTaskOnExistingCluster(summary.getId(), this.s3JarPath, Boolean.valueOf(Objects.toString(this.clusterProperties.getTerminateClusterAfterExecution(), "false")), Objects.toString(this.clusterProperties.getSparksubmitparams(), ""));
        } else if (summary != null && forceRestart) {
            emr.terminateJobFlows(new TerminateJobFlowsRequest().withJobFlowIds(summary.getId()));
            DataPullTask.log.info("Task " + this.taskId + " is forced to be terminated");
            this.runTaskInNewCluster(emr, logPath, this.s3JarPath, Objects.toString(this.clusterProperties.getSparksubmitparams(), ""), haveBootstrapAction);
            DataPullTask.log.info("Task " + this.taskId + " submitted to EMR cluster");
        }
    } else {
        final RunJobFlowResult result = this.runTaskInNewCluster(emr, logPath, this.s3JarPath, Objects.toString(this.clusterProperties.getSparksubmitparams(), ""), haveBootstrapAction);
    }
    DataPullTask.log.info("Task " + this.taskId + " submitted to EMR cluster");
    if (!reapClusters.isEmpty()) {
        reapClusters.forEach(cluster -> {
            String clusterIdToReap = cluster.getId();
            String clusterNameToReap = cluster.getName();
            // ensure we don't reap the cluster we just used
            if (!clusters.isEmpty() && clusters.get(0).getId().equals(clusterIdToReap)) {
                DataPullTask.log.info("Cannot reap in-use cluster " + clusterNameToReap + " with Id " + clusterIdToReap);
            } else {
                DataPullTask.log.info("About to reap cluster " + clusterNameToReap + " with Id " + clusterIdToReap);
                emr.terminateJobFlows(new TerminateJobFlowsRequest().withJobFlowIds(Arrays.asList(clusterIdToReap)));
                DataPullTask.log.info("Reaped cluster " + clusterNameToReap + " with Id " + clusterIdToReap);
            }
        });
    }
}
Also used : AmazonElasticMapReduce(com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduce) DataPullProperties(com.homeaway.datapullclient.config.DataPullProperties)
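
runSparkCluster leans on retryListClusters and retryListSteps helpers that are also not shown in this excerpt. A simple way to build that kind of wrapper is a bounded retry loop with backoff around the EMR call; the sketch below is only an assumption about its shape, and the real helper may retry on different exceptions or back off differently.

import com.amazonaws.AmazonServiceException;
import com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduce;
import com.amazonaws.services.elasticmapreduce.model.ListClustersRequest;
import com.amazonaws.services.elasticmapreduce.model.ListClustersResult;

public class EmrRetrySketch {

    // Hypothetical stand-in for retryListClusters: retry service failures with linear backoff (assumes maxRetry >= 1)
    static ListClustersResult retryListClusters(AmazonElasticMapReduce emr, int maxRetry, ListClustersRequest request) {
        AmazonServiceException lastFailure = null;
        for (int attempt = 1; attempt <= maxRetry; attempt++) {
            try {
                return emr.listClusters(request);
            } catch (AmazonServiceException e) {
                lastFailure = e;
                try {
                    Thread.sleep(1000L * attempt); // back off a little longer on each attempt
                } catch (InterruptedException ie) {
                    Thread.currentThread().interrupt();
                    throw e;
                }
            }
        }
        throw lastFailure;
    }
}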

Aggregations

DataPullProperties (com.homeaway.datapullclient.config.DataPullProperties): 6
EMRProperties (com.homeaway.datapullclient.config.EMRProperties): 4
AmazonS3 (com.amazonaws.services.s3.AmazonS3): 3
DataPullClientConfig (com.homeaway.datapullclient.config.DataPullClientConfig): 3
ClusterProperties (com.homeaway.datapullclient.input.ClusterProperties): 3
java.util (java.util): 3
Collectors (java.util.stream.Collectors): 3
Slf4j (lombok.extern.slf4j.Slf4j): 3
Autowired (org.springframework.beans.factory.annotation.Autowired): 3
AmazonElasticMapReduce (com.amazonaws.services.elasticmapreduce.AmazonElasticMapReduce): 2
com.amazonaws.services.s3.model (com.amazonaws.services.s3.model): 2
JsonNode (com.fasterxml.jackson.databind.JsonNode): 2
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 2
ObjectNode (com.fasterxml.jackson.databind.node.ObjectNode): 2
DataPullContext (com.homeaway.datapullclient.config.DataPullContext): 2
DataPullContextHolder (com.homeaway.datapullclient.config.DataPullContextHolder): 2
InvalidPointedJsonException (com.homeaway.datapullclient.exception.InvalidPointedJsonException): 2
ProcessingException (com.homeaway.datapullclient.exception.ProcessingException): 2
JsonInputFile (com.homeaway.datapullclient.input.JsonInputFile): 2
Migration (com.homeaway.datapullclient.input.Migration): 2
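
All of the usages above read DataPullProperties through plain getters (getApplicationHistoryFolder, getS3BucketName, getLogFilePath, getApplicationSubnet1). If the class is an ordinary Spring Boot configuration-properties bean, it would look roughly like this sketch; only the getter names are taken from the examples, while the prefix and the setters are assumptions.

import org.springframework.boot.context.properties.ConfigurationProperties;

// Rough shape inferred from the getters used above; the "datapull" prefix is an assumption
@ConfigurationProperties(prefix = "datapull")
public class DataPullPropertiesSketch {

    private String applicationHistoryFolder;
    private String s3BucketName;
    private String logFilePath;
    private String applicationSubnet1;

    public String getApplicationHistoryFolder() { return applicationHistoryFolder; }
    public void setApplicationHistoryFolder(String applicationHistoryFolder) { this.applicationHistoryFolder = applicationHistoryFolder; }

    public String getS3BucketName() { return s3BucketName; }
    public void setS3BucketName(String s3BucketName) { this.s3BucketName = s3BucketName; }

    public String getLogFilePath() { return logFilePath; }
    public void setLogFilePath(String logFilePath) { this.logFilePath = logFilePath; }

    public String getApplicationSubnet1() { return applicationSubnet1; }
    public void setApplicationSubnet1(String applicationSubnet1) { this.applicationSubnet1 = applicationSubnet1; }
}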