
Example 1 with SparkCounter

Use of org.apache.hive.spark.counter.SparkCounter in project hive by apache.

From the class SparkTask, the method execute:

@Override
public int execute() {
    int rc = 0;
    perfLogger = SessionState.getPerfLogger();
    SparkSession sparkSession = null;
    SparkSessionManager sparkSessionManager = null;
    try {
        printConfigInfo();
        sparkSessionManager = SparkSessionManagerImpl.getInstance();
        sparkSession = SparkUtilities.getSparkSession(conf, sparkSessionManager);
        SparkWork sparkWork = getWork();
        sparkWork.setRequiredCounterPrefix(getOperatorCounters());
        // Submit the Spark job
        perfLogger.perfLogBegin(CLASS_NAME, PerfLogger.SPARK_SUBMIT_JOB);
        submitTime = perfLogger.getStartTime(PerfLogger.SPARK_SUBMIT_JOB);
        jobRef = sparkSession.submit(taskQueue, context, sparkWork);
        perfLogger.perfLogEnd(CLASS_NAME, PerfLogger.SPARK_SUBMIT_JOB);
        // If the driver context has been shutdown (due to query cancellation) kill the Spark job
        if (taskQueue.isShutdown()) {
            LOG.warn("Killing Spark job");
            killJob();
            throw new HiveException(String.format("Spark task %s cancelled for query %s", getId(), sparkWork.getQueryId()));
        }
        // Get the Job Handle id associated with the Spark job
        sparkJobHandleId = jobRef.getJobId();
        // Add Spark job handle id to the Hive History
        addToHistory(Keys.SPARK_JOB_HANDLE_ID, jobRef.getJobId());
        LOG.debug("Starting Spark job with job handle id " + sparkJobHandleId);
        // Get the application id of the Spark app
        jobID = jobRef.getSparkJobStatus().getAppID();
        // Start monitoring the Spark job, returns when the Spark job has completed / failed, or if
        // a timeout occurs
        rc = jobRef.monitorJob();
        // Get the id of the Spark job that was launched, returns -1 if no Spark job was launched
        sparkJobID = jobRef.getSparkJobStatus().getJobId();
        // Add Spark job id to the Hive History
        addToHistory(Keys.SPARK_JOB_ID, Integer.toString(sparkJobID));
        // Get the final state of the Spark job and parse its job info
        SparkJobStatus sparkJobStatus = jobRef.getSparkJobStatus();
        getSparkJobInfo(sparkJobStatus);
        setSparkException(sparkJobStatus, rc);
        if (rc == 0) {
            sparkStatistics = sparkJobStatus.getSparkStatistics();
            if (SessionState.get() != null) {
                // Set the number of rows written by insert queries, so it can be printed in the client (Beeline).
                SparkCounters counters = sparkJobStatus.getCounter();
                if (counters != null) {
                    SparkCounter counter = counters.getCounter(HiveConf.getVar(conf, HiveConf.ConfVars.HIVECOUNTERGROUP), FileSinkOperator.TOTAL_TABLE_ROWS_WRITTEN);
                    if (counter != null) {
                        queryState.setNumModifiedRows(counter.getValue());
                    }
                }
            }
            printConsoleMetrics();
            printExcessiveGCWarning();
            if (LOG.isInfoEnabled() && sparkStatistics != null) {
                LOG.info(sparkStatisticsToString(sparkStatistics, sparkJobID));
            }
            LOG.info("Successfully completed Spark job[" + sparkJobID + "] with application ID " + jobID + " and task ID " + getId());
        } else if (rc == 2) {
            // Cancel job if the monitor found job submission timeout.
            // TODO: If the timeout is because of lack of resources in the cluster, we should
            // ideally also cancel the app request here. But w/o facilities from Spark or YARN,
            // it's difficult to do it on hive side alone. See HIVE-12650.
            LOG.debug("Failed to submit Spark job with job handle id " + sparkJobHandleId);
            LOG.info("Failed to submit Spark job for application id " + (Strings.isNullOrEmpty(jobID) ? "UNKNOWN" : jobID));
            killJob();
        } else if (rc == 4) {
            LOG.info("The Spark job or one stage of it has too many tasks" + ". Cancelling Spark job " + sparkJobID + " with application ID " + jobID);
            killJob();
        }
        if (this.jobID == null) {
            this.jobID = sparkJobStatus.getAppID();
        }
        sparkJobStatus.cleanup();
    } catch (Exception e) {
        LOG.error("Failed to execute Spark task \"" + getId() + "\"", e);
        setException(e);
        if (e instanceof HiveException) {
            HiveException he = (HiveException) e;
            rc = he.getCanonicalErrorMsg().getErrorCode();
        } else {
            rc = 1;
        }
    } finally {
        startTime = perfLogger.getEndTime(PerfLogger.SPARK_SUBMIT_TO_RUNNING);
        // startTime may not have been recorded if the job never reached the RUNNING state
        // (for example, if it finished very quickly); in that case, fall back to submitTime.
        if (startTime < submitTime) {
            startTime = submitTime;
        }
        finishTime = perfLogger.getEndTime(PerfLogger.SPARK_RUN_JOB);
        Utilities.clearWork(conf);
        if (sparkSession != null && sparkSessionManager != null) {
            rc = close(rc);
            try {
                sparkSessionManager.returnSession(sparkSession);
            } catch (HiveException ex) {
                LOG.error("Failed to return the session to SessionManager", ex);
            }
        }
    }
    return rc;
}
Also used : SparkSession(org.apache.hadoop.hive.ql.exec.spark.session.SparkSession), SparkSessionManager(org.apache.hadoop.hive.ql.exec.spark.session.SparkSessionManager), SparkJobStatus(org.apache.hadoop.hive.ql.exec.spark.status.SparkJobStatus), SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork), SparkCounters(org.apache.hive.spark.counter.SparkCounters), SparkCounter(org.apache.hive.spark.counter.SparkCounter), HiveException(org.apache.hadoop.hive.ql.metadata.HiveException), IOException(java.io.IOException)
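
The counter read in the success path above is a single lookup: an insert query's row count is published in the counter group named by HiveConf.ConfVars.HIVECOUNTERGROUP under the key FileSinkOperator.TOTAL_TABLE_ROWS_WRITTEN. Below is a minimal sketch of that lookup in isolation, assuming a SparkCounters instance taken from a finished job (for example via jobRef.getSparkJobStatus().getCounter()); the RowCountReader class is hypothetical and uses only the accessors visible in the example above.

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hive.spark.counter.SparkCounter;
import org.apache.hive.spark.counter.SparkCounters;

// Hypothetical helper (not part of Hive): reads the "total table rows written"
// counter the same way SparkTask.execute() does after a successful job.
public final class RowCountReader {

    private RowCountReader() {
    }

    // Returns the number of rows written by the job, or -1 if the counter
    // was not published (e.g. the job exposed no counters or the query wrote no rows).
    public static long modifiedRows(HiveConf conf, SparkCounters counters) {
        if (counters == null) {
            return -1L;
        }
        // The counter group name is configurable (HiveConf.ConfVars.HIVECOUNTERGROUP);
        // the counter key is the FileSinkOperator constant used in the example above.
        String group = HiveConf.getVar(conf, HiveConf.ConfVars.HIVECOUNTERGROUP);
        SparkCounter counter = counters.getCounter(group, FileSinkOperator.TOTAL_TABLE_ROWS_WRITTEN);
        return counter == null ? -1L : counter.getValue();
    }
}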

Example 2 with SparkCounter

Use of org.apache.hive.spark.counter.SparkCounter in project hive by apache.

From the class SparkStatisticsBuilder, the method add:

public SparkStatisticsBuilder add(SparkCounters sparkCounters) {
    for (SparkCounterGroup counterGroup : sparkCounters.getSparkCounterGroups().values()) {
        String groupDisplayName = counterGroup.getGroupDisplayName();
        List<SparkStatistic> statisticList = statisticMap.get(groupDisplayName);
        if (statisticList == null) {
            statisticList = new LinkedList<SparkStatistic>();
            statisticMap.put(groupDisplayName, statisticList);
        }
        for (SparkCounter counter : counterGroup.getSparkCounters().values()) {
            String displayName = counter.getDisplayName();
            statisticList.add(new SparkStatistic(displayName, Long.toString(counter.getValue())));
        }
    }
    return this;
}
Also used : SparkCounterGroup(org.apache.hive.spark.counter.SparkCounterGroup), SparkCounter(org.apache.hive.spark.counter.SparkCounter)
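
The builder above flattens every counter group into a list of (display name, value) statistics keyed by the group's display name. The sketch below performs the same traversal just to print the counters, assuming only the accessors used in the example (getSparkCounterGroups(), getGroupDisplayName(), getSparkCounters(), getDisplayName(), getValue()); the CounterDump class itself is hypothetical.

import org.apache.hive.spark.counter.SparkCounter;
import org.apache.hive.spark.counter.SparkCounterGroup;
import org.apache.hive.spark.counter.SparkCounters;

// Hypothetical helper (not part of Hive): prints every counter as
// "group / name = value", using the same traversal as SparkStatisticsBuilder.add().
public final class CounterDump {

    private CounterDump() {
    }

    public static void dump(SparkCounters sparkCounters) {
        for (SparkCounterGroup group : sparkCounters.getSparkCounterGroups().values()) {
            for (SparkCounter counter : group.getSparkCounters().values()) {
                System.out.println(group.getGroupDisplayName() + " / "
                        + counter.getDisplayName() + " = " + counter.getValue());
            }
        }
    }
}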

Aggregations

SparkCounter (org.apache.hive.spark.counter.SparkCounter): 2 usages
IOException (java.io.IOException): 1 usage
SparkSession (org.apache.hadoop.hive.ql.exec.spark.session.SparkSession): 1 usage
SparkSessionManager (org.apache.hadoop.hive.ql.exec.spark.session.SparkSessionManager): 1 usage
SparkJobStatus (org.apache.hadoop.hive.ql.exec.spark.status.SparkJobStatus): 1 usage
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 1 usage
SparkWork (org.apache.hadoop.hive.ql.plan.SparkWork): 1 usage
SparkCounterGroup (org.apache.hive.spark.counter.SparkCounterGroup): 1 usage
SparkCounters (org.apache.hive.spark.counter.SparkCounters): 1 usage