Search in sources :

Example 11 with TuningJobExecution

use of models.TuningJobExecution in project dr-elephant by linkedin.

the class FitnessComputeUtil method getCompletedExecutions.

/**
 * Returns the list of completed executions whose metrics are not computed.
 *
 * <p>Only executions in the {@code ParamSetStatus.EXECUTED} state are considered, and an execution
 * is returned only if its job execution was last updated at least {@code waitInterval}
 * milliseconds ago; more recently updated executions are left for a later call.
 *
 * <p>An execution with no linked job execution or no update timestamp is skipped with a warning,
 * so a single incomplete record does not stop the remaining executions from being returned.
 *
 * @return List of job execution ready for fitness computation; empty if there are none
 */
private List<TuningJobExecution> getCompletedExecutions() {
    logger.info("Fetching completed executions whose fitness are yet to be computed");
    List<TuningJobExecution> outputJobExecutions = new ArrayList<TuningJobExecution>();
    List<TuningJobExecution> jobExecutions = TuningJobExecution.find.select("*")
        .where()
        .eq(TuningJobExecution.TABLE.paramSetState, ParamSetStatus.EXECUTED)
        .findList();
    if (jobExecutions != null) {
        for (TuningJobExecution tuningJobExecution : jobExecutions) {
            // Guard explicitly instead of catching NullPointerException around the whole loop:
            // one incomplete record must not discard every execution that follows it.
            if (tuningJobExecution == null
                || tuningJobExecution.jobExecution == null
                || tuningJobExecution.jobExecution.updatedTs == null) {
                logger.warn("Skipping execution with missing job execution or last updated time");
                continue;
            }
            // Read the clock once so the logged time is the one the decision is based on.
            long currentTime = System.currentTimeMillis();
            long updatedTime = tuningJobExecution.jobExecution.updatedTs.getTime();
            logger.debug("Current Time in millis: " + currentTime + ", Job execution last updated time " + updatedTime);
            if (currentTime - updatedTime < waitInterval) {
                logger.debug("Delaying fitness compute for execution: " + tuningJobExecution.jobExecution.jobExecId);
            } else {
                logger.debug("Adding execution " + tuningJobExecution.jobExecution.jobExecId + " for fitness computation");
                outputJobExecutions.add(tuningJobExecution);
            }
        }
    }
    logger.info("Number of completed execution fetched for fitness computation: " + outputJobExecutions.size());
    logger.debug("Finished fetching completed executions for fitness computation");
    return outputJobExecutions;
}
Also used : ArrayList(java.util.ArrayList) TuningJobExecution(models.TuningJobExecution)

Example 12 with TuningJobExecution

use of models.TuningJobExecution in project dr-elephant by linkedin.

the class FitnessComputeUtil method updateExecutionMetrics.

/**
 * Updates the execution metrics and fitness of the given completed executions.
 *
 * <p>For each execution, the matching tuning-enabled job definition and the application results of
 * the job execution are loaded. When results exist, execution time, resource usage and input size
 * are aggregated and stored on the job execution, the job definition's averages are initialised
 * from them if not yet set, and the fitness is computed. A failed or cancelled execution, or one
 * exceeding the allowed resource usage, receives a penalty fitness. When no results exist, only a
 * failed or cancelled execution is updated (zeroed metrics plus penalty fitness).
 *
 * <p>Any exception raised while handling one execution is logged and the loop moves on to the
 * next execution.
 *
 * @param completedExecutions List of completed executions
 */
protected void updateExecutionMetrics(List<TuningJobExecution> completedExecutions) {
    for (TuningJobExecution tuningJobExecution : completedExecutions) {
        logger.info("Updating execution metrics and fitness for execution: " + tuningJobExecution.jobExecution.jobExecId);
        try {
            JobExecution jobExecution = tuningJobExecution.jobExecution;
            JobDefinition job = jobExecution.job;
            // job id match and tuning enabled
            TuningJobDefinition tuningJobDefinition = TuningJobDefinition.find.select("*")
                .fetch(TuningJobDefinition.TABLE.job, "*")
                .where()
                .eq(TuningJobDefinition.TABLE.job + "." + JobDefinition.TABLE.id, job.id)
                .eq(TuningJobDefinition.TABLE.tuningEnabled, 1)
                .findUnique();
            if (tuningJobDefinition == null) {
                // findUnique yields null when nothing matches; report that explicitly instead of
                // letting the first dereference below fail with an opaque NullPointerException.
                logger.error("No tuning enabled job definition found for execution " + jobExecution.jobExecId + ", skipping fitness computation");
                continue;
            }
            List<AppResult> results = AppResult.find.select("*")
                .fetch(AppResult.TABLE.APP_HEURISTIC_RESULTS, "*")
                .fetch(AppResult.TABLE.APP_HEURISTIC_RESULTS + "." + AppHeuristicResult.TABLE.APP_HEURISTIC_RESULT_DETAILS, "*")
                .where()
                .eq(AppResult.TABLE.FLOW_EXEC_ID, jobExecution.flowExecution.flowExecId)
                .eq(AppResult.TABLE.JOB_EXEC_ID, jobExecution.jobExecId)
                .findList();
            if (results != null && results.size() > 0) {
                double totalResourceUsed = 0D;
                double totalInputBytesInBytes = 0D;
                for (AppResult appResult : results) {
                    totalResourceUsed += appResult.resourceUsed;
                    totalInputBytesInBytes += getTotalInputBytes(appResult);
                }
                // Execution time excludes the time the applications spent waiting.
                long totalExecutionTime = Utils.getTotalRuntime(results) - Utils.getTotalWaittime(results);
                if (totalExecutionTime != 0) {
                    jobExecution.executionTime = totalExecutionTime * 1.0 / (1000 * 60);
                    jobExecution.resourceUsage = totalResourceUsed / (1024 * 3600);
                    jobExecution.inputSizeInBytes = totalInputBytesInBytes;
                    logger.info("Metric Values for execution " + jobExecution.jobExecId + ": Execution time = " + totalExecutionTime + ", Resource usage = " + totalResourceUsed + " and total input size = " + totalInputBytesInBytes);
                }
                // The first execution with usable metrics seeds the job definition's averages.
                if (tuningJobDefinition.averageResourceUsage == null && totalExecutionTime != 0) {
                    tuningJobDefinition.averageResourceUsage = jobExecution.resourceUsage;
                    tuningJobDefinition.averageExecutionTime = jobExecution.executionTime;
                    tuningJobDefinition.averageInputSizeInBytes = jobExecution.inputSizeInBytes.longValue();
                    tuningJobDefinition.update();
                }
                // Compute fitness
                // NOTE(review): when totalExecutionTime is 0 the metrics above are not written, so
                // the branches below use whatever jobExecution.resourceUsage and the job
                // definition's averages already hold - confirm these are always populated.
                // NOTE(review): a total input size of 0 makes the divisions below a floating-point
                // division by zero - confirm the input size is guaranteed to be non-zero.
                if (isFailedOrCancelled(jobExecution)) {
                    logger.info("Execution " + jobExecution.jobExecId + " failed/cancelled. Applying penalty");
                    // Todo: Check if the reason of failure is auto tuning and  handle cancelled cases
                    tuningJobExecution.fitness = penaltyFitness(tuningJobDefinition, tuningJobDefinition.averageInputSizeInBytes);
                } else if (jobExecution.resourceUsage
                    > (tuningJobDefinition.averageResourceUsage * tuningJobDefinition.allowedMaxResourceUsagePercent / 100.0)) {
                    // Todo: Check execution time constraint as well
                    logger.info("Execution " + jobExecution.jobExecId + " violates constraint on resource usage");
                    tuningJobExecution.fitness = penaltyFitness(tuningJobDefinition, totalInputBytesInBytes);
                } else {
                    tuningJobExecution.fitness = jobExecution.resourceUsage * FileUtils.ONE_GB / totalInputBytesInBytes;
                }
                tuningJobExecution.paramSetState = ParamSetStatus.FITNESS_COMPUTED;
                jobExecution.update();
                tuningJobExecution.update();
            } else if (isFailedOrCancelled(jobExecution)) {
                // No results were found: only a failed/cancelled execution is finalised here, with
                // zeroed metrics and the penalty fitness; any other execution is left untouched.
                // Todo: Check if the reason of failure is auto tuning and  handle cancelled cases
                tuningJobExecution.fitness = penaltyFitness(tuningJobDefinition, tuningJobDefinition.averageInputSizeInBytes);
                jobExecution.executionTime = 0D;
                jobExecution.resourceUsage = 0D;
                jobExecution.inputSizeInBytes = 0D;
                tuningJobExecution.paramSetState = ParamSetStatus.FITNESS_COMPUTED;
                jobExecution.update();
                tuningJobExecution.update();
            }
        } catch (Exception e) {
            // Deliberately broad: a failure on one execution must not stop the remaining ones.
            logger.error("Error updating fitness of execution: " + tuningJobExecution.jobExecution.id + "\n Stacktrace: ", e);
        }
    }
    logger.info("Execution metrics updated");
}

/**
 * Tells whether the given job execution ended in the FAILED or CANCELLED state.
 *
 * @param jobExecution Job execution to inspect
 * @return true if the execution state is FAILED or CANCELLED
 */
private static boolean isFailedOrCancelled(JobExecution jobExecution) {
    return jobExecution.executionState.equals(JobExecution.ExecutionState.FAILED)
        || jobExecution.executionState.equals(JobExecution.ExecutionState.CANCELLED);
}

/**
 * Computes the penalty fitness: three times the job definition's allowed maximum resource usage
 * (average resource usage scaled by the allowed percentage), per GB of the given input size.
 *
 * @param tuningJobDefinition Job definition supplying the average resource usage and allowed percentage
 * @param inputSizeInBytes Input size, in bytes, used to normalise the penalty
 * @return The penalty fitness value
 */
private static double penaltyFitness(TuningJobDefinition tuningJobDefinition, double inputSizeInBytes) {
    return 3 * tuningJobDefinition.averageResourceUsage * tuningJobDefinition.allowedMaxResourceUsagePercent * FileUtils.ONE_GB
        / (100.0 * inputSizeInBytes);
}
Also used : TuningJobExecution(models.TuningJobExecution) JobExecution(models.JobExecution) TuningJobExecution(models.TuningJobExecution) TuningJobDefinition(models.TuningJobDefinition) JobDefinition(models.JobDefinition) TuningJobDefinition(models.TuningJobDefinition) AppResult(models.AppResult)

Aggregations

TuningJobExecution (models.TuningJobExecution)12 JobExecution (models.JobExecution)7 TuningJobDefinition (models.TuningJobDefinition)4 JobDefinition (models.JobDefinition)3 ArrayList (java.util.ArrayList)2 FlowExecution (models.FlowExecution)2 JobSuggestedParamValue (models.JobSuggestedParamValue)2 JsonNode (com.fasterxml.jackson.databind.JsonNode)1 AzkabanJobStatusUtil (com.linkedin.drelephant.clients.azkaban.AzkabanJobStatusUtil)1 FitnessComputeUtil (com.linkedin.drelephant.tuning.FitnessComputeUtil)1 IOException (java.io.IOException)1 HttpURLConnection (java.net.HttpURLConnection)1 MalformedURLException (java.net.MalformedURLException)1 URISyntaxException (java.net.URISyntaxException)1 URL (java.net.URL)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 AppResult (models.AppResult)1 FlowDefinition (models.FlowDefinition)1 Configuration (org.apache.hadoop.conf.Configuration)1