Search in sources :

Example 6 with JobExecution

Use of models.JobExecution in the dr-elephant project by LinkedIn.

The getCompletedExecutions method of the AzkabanJobCompleteDetector class.

/**
 * Checks the current Azkaban status of every started execution and returns the ones that have
 * reached a terminal state.
 *
 * <p>For each execution the job statuses of its flow execution are fetched via
 * {@code _azkabanJobStatusUtil.getJobsFromFlow}. When the entry whose key equals the job name
 * reports FAILED, CANCELLED/KILLED or SUCCEEDED, {@code jobExecution.executionState} is set to
 * the matching state and {@code tuningJobExecution.paramSetState} is set to
 * {@code ParamSetStatus.EXECUTED}. An execution is added to the result when its
 * {@code paramSetState} is {@code EXECUTED} after that check. The entities are only modified in
 * memory here; this method does not call {@code update()} on them.
 *
 * <p>A failure while checking one execution is logged and does not prevent the remaining
 * executions from being checked.
 *
 * @param jobExecutions Started Execution list
 * @return List of completed executions; never {@code null}, possibly empty
 * @throws MalformedURLException kept in the signature for callers; every exception raised in the
 *         body is caught and logged
 * @throws URISyntaxException kept in the signature for callers; every exception raised in the
 *         body is caught and logged
 */
protected List<TuningJobExecution> getCompletedExecutions(List<TuningJobExecution> jobExecutions) throws MalformedURLException, URISyntaxException {
    logger.info("Fetching the list of executions completed since last iteration");
    List<TuningJobExecution> completedExecutions = new ArrayList<TuningJobExecution>();
    try {
        for (TuningJobExecution tuningJobExecution : jobExecutions) {
            JobExecution jobExecution = tuningJobExecution.jobExecution;
            logger.info("Checking current status of started execution: " + jobExecution.jobExecId);
            // The status client is created lazily, on the first execution that needs it.
            if (_azkabanJobStatusUtil == null) {
                logger.info("Initializing  AzkabanJobStatusUtil");
                _azkabanJobStatusUtil = new AzkabanJobStatusUtil();
            }
            try {
                Map<String, String> jobStatus = _azkabanJobStatusUtil.getJobsFromFlow(jobExecution.flowExecution.flowExecId);
                if (jobStatus == null) {
                    logger.info("No jobs found for flow execution: " + jobExecution.flowExecution.flowExecId);
                    continue;
                }
                for (Map.Entry<String, String> job : jobStatus.entrySet()) {
                    String jobName = job.getKey();
                    String status = job.getValue();
                    logger.info("Job Found:" + jobName + ". Status: " + status);
                    if (jobName == null || !jobName.equals(jobExecution.job.jobName)) {
                        continue;
                    }
                    // Constant-first comparisons: a missing status is treated as "not terminal"
                    // instead of raising a NullPointerException.
                    if (AzkabanJobStatus.FAILED.toString().equals(status)) {
                        tuningJobExecution.paramSetState = ParamSetStatus.EXECUTED;
                        jobExecution.executionState = ExecutionState.FAILED;
                    } else if (AzkabanJobStatus.CANCELLED.toString().equals(status) || AzkabanJobStatus.KILLED.toString().equals(status)) {
                        tuningJobExecution.paramSetState = ParamSetStatus.EXECUTED;
                        jobExecution.executionState = ExecutionState.CANCELLED;
                    } else if (AzkabanJobStatus.SUCCEEDED.toString().equals(status)) {
                        tuningJobExecution.paramSetState = ParamSetStatus.EXECUTED;
                        jobExecution.executionState = ExecutionState.SUCCEEDED;
                    }
                    // Null-safe: an execution whose paramSetState was never set counts as running.
                    if (ParamSetStatus.EXECUTED.equals(tuningJobExecution.paramSetState)) {
                        completedExecutions.add(tuningJobExecution);
                        logger.info("Execution " + jobExecution.jobExecId + " is completed");
                    } else {
                        logger.info("Execution " + jobExecution.jobExecId + " is still in running state");
                    }
                }
            } catch (Exception e) {
                // Best effort: one failing status check must not abort the whole polling pass.
                logger.error("Error in checking status of execution: " + jobExecution.jobExecId, e);
            }
        }
    } catch (Exception e) {
        // The logger already records the stack trace; no separate printStackTrace() is needed.
        logger.error("Error in fetching list of completed executions", e);
    }
    logger.info("Number of executions completed since last iteration: " + completedExecutions.size());
    return completedExecutions;
}
Also used : TuningJobExecution(models.TuningJobExecution) JobExecution(models.JobExecution) AzkabanJobStatusUtil(com.linkedin.drelephant.clients.azkaban.AzkabanJobStatusUtil) ArrayList(java.util.ArrayList) TuningJobExecution(models.TuningJobExecution) Map(java.util.Map) MalformedURLException(java.net.MalformedURLException) URISyntaxException(java.net.URISyntaxException)

Example 7 with JobExecution

Use of models.JobExecution in the dr-elephant project by LinkedIn.

The updateExecutionMetrics method of the FitnessComputeUtil class.

/**
 * Updates the execution metrics and the fitness of each completed execution.
 *
 * <p>For every execution the tuning-enabled {@code TuningJobDefinition} of its job and the
 * {@code AppResult} rows matching its flow execution id and job execution id are loaded.
 * <ul>
 *   <li>When results exist, execution time, resource usage and input size are aggregated from
 *       them and stored on the job execution (only when the aggregated execution time is
 *       non-zero). If the definition has no average resource usage yet, these values also
 *       become its averages. The fitness is then computed and both entities are persisted.</li>
 *   <li>When no results exist, only FAILED or CANCELLED executions are handled: they receive
 *       the penalty fitness and zeroed metrics, and are persisted. Other executions are left
 *       untouched.</li>
 * </ul>
 * An error while processing one execution is logged and the loop continues with the next one.
 *
 * @param completedExecutions List of completed executions
 */
protected void updateExecutionMetrics(List<TuningJobExecution> completedExecutions) {
    for (TuningJobExecution tuningJobExecution : completedExecutions) {
        logger.info("Updating execution metrics and fitness for execution: " + tuningJobExecution.jobExecution.jobExecId);
        try {
            JobExecution jobExecution = tuningJobExecution.jobExecution;
            JobDefinition job = jobExecution.job;
            // job id match and tuning enabled
            TuningJobDefinition tuningJobDefinition = TuningJobDefinition.find.select("*").fetch(TuningJobDefinition.TABLE.job, "*").where().eq(TuningJobDefinition.TABLE.job + "." + JobDefinition.TABLE.id, job.id).eq(TuningJobDefinition.TABLE.tuningEnabled, 1).findUnique();
            if (tuningJobDefinition == null) {
                // Fail with an explicit message instead of an opaque NullPointerException later on.
                logger.error("No tuning enabled job definition found for execution: " + jobExecution.jobExecId + ". Skipping fitness computation");
                continue;
            }
            List<AppResult> results = AppResult.find.select("*").fetch(AppResult.TABLE.APP_HEURISTIC_RESULTS, "*").fetch(AppResult.TABLE.APP_HEURISTIC_RESULTS + "." + AppHeuristicResult.TABLE.APP_HEURISTIC_RESULT_DETAILS, "*").where().eq(AppResult.TABLE.FLOW_EXEC_ID, jobExecution.flowExecution.flowExecId).eq(AppResult.TABLE.JOB_EXEC_ID, jobExecution.jobExecId).findList();
            if (results != null && results.size() > 0) {
                // Primitive accumulators avoid re-boxing on every loop iteration.
                double totalResourceUsed = 0D;
                double totalInputBytesInBytes = 0D;
                for (AppResult appResult : results) {
                    totalResourceUsed += appResult.resourceUsed;
                    totalInputBytesInBytes += getTotalInputBytes(appResult);
                }
                long totalRunTime = Utils.getTotalRuntime(results);
                long totalDelay = Utils.getTotalWaittime(results);
                long totalExecutionTime = totalRunTime - totalDelay;
                boolean hasExecutionTime = totalExecutionTime != 0;
                if (hasExecutionTime) {
                    // NOTE(review): divisors suggest ms -> minutes and MB-seconds -> GB-hours; confirm units.
                    jobExecution.executionTime = totalExecutionTime / (1000.0 * 60);
                    jobExecution.resourceUsage = totalResourceUsed / (1024 * 3600);
                    jobExecution.inputSizeInBytes = totalInputBytesInBytes;
                    logger.info("Metric Values for execution " + jobExecution.jobExecId + ": Execution time = " + totalExecutionTime + ", Resource usage = " + totalResourceUsed + " and total input size = " + totalInputBytesInBytes);
                }
                // The first execution with usable metrics seeds the definition's averages.
                if (tuningJobDefinition.averageResourceUsage == null && hasExecutionTime) {
                    tuningJobDefinition.averageResourceUsage = jobExecution.resourceUsage;
                    tuningJobDefinition.averageExecutionTime = jobExecution.executionTime;
                    tuningJobDefinition.averageInputSizeInBytes = jobExecution.inputSizeInBytes.longValue();
                    tuningJobDefinition.update();
                }
                // Compute fitness
                if (jobExecution.executionState.equals(JobExecution.ExecutionState.FAILED) || jobExecution.executionState.equals(JobExecution.ExecutionState.CANCELLED)) {
                    logger.info("Execution " + jobExecution.jobExecId + " failed/cancelled. Applying penalty");
                    // Todo: Check if the reason of failure is auto tuning and  handle cancelled cases
                    tuningJobExecution.fitness = computePenaltyFitness(tuningJobDefinition, tuningJobDefinition.averageInputSizeInBytes);
                // Todo: Check execution time constraint as well
                } else if (jobExecution.resourceUsage > (tuningJobDefinition.averageResourceUsage * tuningJobDefinition.allowedMaxResourceUsagePercent / 100.0)) {
                    logger.info("Execution " + jobExecution.jobExecId + " violates constraint on resource usage");
                    tuningJobExecution.fitness = computePenaltyFitness(tuningJobDefinition, totalInputBytesInBytes);
                } else {
                    tuningJobExecution.fitness = jobExecution.resourceUsage * FileUtils.ONE_GB / totalInputBytesInBytes;
                }
                tuningJobExecution.paramSetState = ParamSetStatus.FITNESS_COMPUTED;
                jobExecution.update();
                tuningJobExecution.update();
            } else if (jobExecution.executionState.equals(JobExecution.ExecutionState.FAILED) || jobExecution.executionState.equals(JobExecution.ExecutionState.CANCELLED)) {
                // No results were recorded: penalise the failed/cancelled run and zero its metrics.
                // Todo: Check if the reason of failure is auto tuning and  handle cancelled cases
                tuningJobExecution.fitness = computePenaltyFitness(tuningJobDefinition, tuningJobDefinition.averageInputSizeInBytes);
                jobExecution.executionTime = 0D;
                jobExecution.resourceUsage = 0D;
                jobExecution.inputSizeInBytes = 0D;
                tuningJobExecution.paramSetState = ParamSetStatus.FITNESS_COMPUTED;
                jobExecution.update();
                tuningJobExecution.update();
            }
        } catch (Exception e) {
            logger.error("Error updating fitness of execution: " + tuningJobExecution.jobExecution.id + "\n Stacktrace: ", e);
        }
    }
    logger.info("Execution metrics updated");
}

/**
 * Computes the penalty fitness: three times the definition's allowed resource usage
 * (average resource usage scaled by the allowed maximum percentage), multiplied by
 * {@code FileUtils.ONE_GB} and divided by the given input size.
 *
 * @param tuningJobDefinition definition supplying the average resource usage and allowed percentage
 * @param inputSizeInBytes input size used as the divisor
 * @return the penalty fitness value
 */
private double computePenaltyFitness(TuningJobDefinition tuningJobDefinition, double inputSizeInBytes) {
    return 3 * tuningJobDefinition.averageResourceUsage * tuningJobDefinition.allowedMaxResourceUsagePercent * FileUtils.ONE_GB / (100.0 * inputSizeInBytes);
}
Also used : TuningJobExecution(models.TuningJobExecution) JobExecution(models.JobExecution) TuningJobExecution(models.TuningJobExecution) TuningJobDefinition(models.TuningJobDefinition) JobDefinition(models.JobDefinition) TuningJobDefinition(models.TuningJobDefinition) AppResult(models.AppResult)

Aggregations

JobExecution (models.JobExecution)7 TuningJobExecution (models.TuningJobExecution)7 FlowExecution (models.FlowExecution)2 JobDefinition (models.JobDefinition)2 TuningJobDefinition (models.TuningJobDefinition)2 AzkabanJobStatusUtil (com.linkedin.drelephant.clients.azkaban.AzkabanJobStatusUtil)1 IOException (java.io.IOException)1 HttpURLConnection (java.net.HttpURLConnection)1 MalformedURLException (java.net.MalformedURLException)1 URISyntaxException (java.net.URISyntaxException)1 URL (java.net.URL)1 ArrayList (java.util.ArrayList)1 Map (java.util.Map)1 AppResult (models.AppResult)1 JobSuggestedParamValue (models.JobSuggestedParamValue)1 AuthenticatedURL (org.apache.hadoop.security.authentication.client.AuthenticatedURL)1 AuthenticationException (org.apache.hadoop.security.authentication.client.AuthenticationException)1 JsonNode (org.codehaus.jackson.JsonNode)1