Search in sources :

Example 6 with JobStats

use of org.apache.pig.tools.pigstats.JobStats in project oozie by apache.

the class PigMain method getHadoopJobIds.

/**
 * Builds a comma-separated list of the Hadoop job ids found in the job graph of the given
 * Pig statistics.
 * <p>
 * Ids rejected by {@code StringUtils.isEmpty}, and ids that read "NULL" (ignoring case, after
 * trimming), are left out of the result.
 *
 * @param pigStats stats object obtained through PigStats API
 * @return the retained job ids joined by commas (an empty string when none is retained), or
 *         {@code null} when the Pig API throws {@link UnsupportedOperationException}
 */
protected String getHadoopJobIds(PigStats pigStats) {
    final String delimiter = ",";
    StringBuilder joinedIds = new StringBuilder(STRING_BUFFER_SIZE);
    try {
        for (JobStats stats : pigStats.getJobGraph()) {
            String candidateId = stats.getJobId();
            boolean placeholder = StringUtils.isEmpty(candidateId) || candidateId.trim().equalsIgnoreCase("NULL");
            if (!placeholder) {
                // Only put a delimiter between ids, never in front of the first one
                if (joinedIds.length() != 0) {
                    joinedIds.append(delimiter);
                }
                joinedIds.append(candidateId);
            }
        }
    } catch (UnsupportedOperationException unsupported) {
        // Return null if Pig API's are not supported
        return null;
    }
    return joinedIds.toString();
}
Also used : PigStats(org.apache.pig.tools.pigstats.PigStats) JobStats(org.apache.pig.tools.pigstats.JobStats)

Example 7 with JobStats

use of org.apache.pig.tools.pigstats.JobStats in project ambrose by twitter.

the class AmbrosePigProgressNotificationListener method jobStartedNotification.

/**
 * Called when a job is started. This is the first time that we are notified of a new jobId for a
 * launched job. Hence this method binds the jobId to the DAGNode and pushes a status event.
 * <p>
 * Every entry of the job graph whose jobId equals {@code assignedJobId} is processed. If such an
 * entry has a name that is not registered in {@code dagNodeNameMap}, a warning is logged and the
 * method returns without looking at the remaining entries.
 *
 * @param scriptId scriptId of the running script
 * @param assignedJobId the jobId assigned to the job started.
 */
@Override
public void jobStartedNotification(String scriptId, String assignedJobId) {
    log.info("jobStartedNotification - scriptId " + scriptId + " jobId " + assignedJobId);
    // Find the job stats carrying the assigned jobId, then use their name (logged as "scope") to
    // look up the DAGNode registered under the same name and bind the jobId to it.
    for (JobStats jobStats : pigConfig.getJobGraph()) {
        if (!assignedJobId.equals(jobStats.getJobId())) {
            continue;
        }
        log.info("jobStartedNotification - scope " + jobStats.getName() + " is jobId " + assignedJobId);
        DAGNode<PigJob> node = this.dagNodeNameMap.get(jobStats.getName());
        if (node == null) {
            log.warn("jobStartedNotification - unrecognized operator name found (" + jobStats.getName() + ") for jobId " + assignedJobId);
            return;
        }
        PigJob job = node.getJob();
        job.setId(assignedJobId);
        mapReduceHelper.addMapReduceJobState(job, pigConfig.getJobClient());
        // Index the node by its newly assigned jobId before publishing the started event
        dagNodeJobIdMap.put(job.getId(), node);
        AmbroseUtils.pushEvent(statsWriteService, scriptId, new Event.JobStartedEvent(node));
    }
}
Also used : Event(com.twitter.ambrose.model.Event) JobStats(org.apache.pig.tools.pigstats.JobStats)

Example 8 with JobStats

use of org.apache.pig.tools.pigstats.JobStats in project shifu by ShifuML.

the class ModelDataEncodeProcessor method encodeModelData.

/**
 * Submits the {@code scripts/EncodeData.pig} script and then inspects the Shifu counter group of
 * the resulting Pig jobs to judge the share of records with an invalid tag.
 *
 * @param evalConfig
 *            eval set to encode; when {@code null} the raw data path of the model config and the
 *            {@code TRAINING_DATA_SET} name are passed to the script instead
 * @return {@code 1} when the inspected job reports that invalid tag records make up at least 80%
 *         of the valid records, {@code 0} otherwise
 * @throws IOException
 *             declared for the steps outside the Pig submission block; an {@code IOException}
 *             raised inside that block is rethrown as {@code ShifuException}
 */
@SuppressWarnings("deprecation")
private int encodeModelData(EvalConfig evalConfig) throws IOException {
    RawSourceData.SourceType sourceType = this.modelConfig.getDataSet().getSource();
    // clean up output directories
    ShifuFileUtils.deleteFile(pathFinder.getEncodeDataPath(evalConfig), sourceType);

    // prepare special parameters and execute pig
    Map<String, String> pigParams = new HashMap<String, String>();
    pigParams.put(Constants.SOURCE_TYPE, sourceType.toString());
    if (evalConfig == null) {
        pigParams.put("pathRawData", modelConfig.getDataSetRawPath());
    } else {
        pigParams.put("pathRawData", evalConfig.getDataSet().getDataPath());
    }
    pigParams.put("pathEncodeData", pathFinder.getEncodeDataPath(evalConfig));
    pigParams.put("delimiter", CommonUtils.escapePigString(modelConfig.getDataSetDelimiter()));
    if (evalConfig == null) {
        pigParams.put("evalSetName", TRAINING_DATA_SET);
    } else {
        pigParams.put("evalSetName", evalConfig.getName());
    }
    pigParams.put(Constants.IS_COMPRESS, "true");

    int status = 0;
    try {
        String scriptPath = pathFinder.getScriptPath("scripts/EncodeData.pig");
        PigExecutor.getExecutor().submitJob(modelConfig, scriptPath, pigParams);

        for (JobStats stats : PigStats.get().getJobGraph()) {
            boolean hasShifuCounters = stats.getHadoopCounters() != null
                    && stats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER) != null;
            if (!hasShifuCounters) {
                // Nothing to inspect on this job; the scan stops here
                break;
            }
            long validRecords = stats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter("TOTAL_VALID_COUNT");
            if (validRecords == 0L) {
                // If no basic record counter, check next one
                continue;
            }
            long invalidTags = stats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter("INVALID_TAG");
            LOG.info("Total valid records {} after filtering, invalid tag records {}.", validRecords, invalidTags);
            if (validRecords > 0L && invalidTags * 1d / validRecords >= 0.8d) {
                LOG.error("Too many invalid tags, please check you configuration on positive tags and negative tags.");
                status = 1;
            }
            // only one pig job with such counters, break
            break;
        }
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
    return status;
}
Also used : SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) IOException(java.io.IOException) RawSourceData(ml.shifu.shifu.container.obj.RawSourceData) JobStats(org.apache.pig.tools.pigstats.JobStats) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 9 with JobStats

use of org.apache.pig.tools.pigstats.JobStats in project shifu by ShifuML.

the class EvalModelProcessor method runDistScore.

/**
 * Run pig mode scoring and collect the evaluation counters reported by the Pig jobs.
 * <p>
 * The eval output folders are cleaned up, the eval Pig script is submitted, and the job graph is
 * then scanned for the first job whose Shifu record counter is non-zero. Jobs that expose no
 * Hadoop counters, or no Shifu counter group, are skipped.
 *
 * @param evalConfig
 *            the evaluation configuration to score
 * @return the score status built from the first job with a non-zero record counter, or
 *         {@code null} when no job reports such a counter
 * @throws IOException
 *             any io exception raised outside the Pig submission; an {@code IOException} raised
 *             while submitting the Pig job is rethrown as {@code ShifuException}
 */
@SuppressWarnings("deprecation")
private ScoreStatus runDistScore(EvalConfig evalConfig) throws IOException {
    // clean up output directories
    SourceType sourceType = evalConfig.getDataSet().getSource();
    ShifuFileUtils.deleteFile(pathFinder.getEvalNormalizedPath(evalConfig), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getEvalScorePath(evalConfig), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getEvalPerformancePath(evalConfig), sourceType);
    // prepare special parameters and execute pig
    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put(Constants.SOURCE_TYPE, sourceType.toString());
    paramsMap.put("pathEvalRawData", evalConfig.getDataSet().getDataPath());
    paramsMap.put("pathEvalNormalized", pathFinder.getEvalNormalizedPath(evalConfig));
    paramsMap.put("pathEvalScore", pathFinder.getEvalScorePath(evalConfig));
    paramsMap.put("pathEvalPerformance", pathFinder.getEvalPerformancePath(evalConfig));
    paramsMap.put("eval_set_name", evalConfig.getName());
    paramsMap.put("delimiter", CommonUtils.escapePigString(evalConfig.getDataSet().getDataDelimiter()));
    paramsMap.put("columnIndex", evalConfig.getPerformanceScoreSelector().trim());
    paramsMap.put("scale", Environment.getProperty(Constants.SHIFU_SCORE_SCALE, Integer.toString(Scorer.DEFAULT_SCORE_SCALE)));
    String expressionsAsString = super.modelConfig.getSegmentFilterExpressionsAsString();
    Environment.getProperties().put("shifu.segment.expressions", expressionsAsString);
    String pigScript = "scripts/Eval.pig";
    Map<String, String> confMap = new HashMap<String, String>();
    // max min score folder; timestamp plus random suffix keeps the folder name distinct per run
    String maxMinScoreFolder = ShifuFileUtils.getFileSystemBySourceType(sourceType).makeQualified(new Path("tmp" + File.separator + "maxmin_score_" + System.currentTimeMillis() + "_" + RANDOM.nextLong())).toString();
    confMap.put(Constants.SHIFU_EVAL_MAXMIN_SCORE_OUTPUT, maxMinScoreFolder);
    if (modelConfig.isClassification() || (isNoSort() && EvalStep.SCORE.equals(this.evalStep))) {
        pigScript = "scripts/EvalScore.pig";
    }
    try {
        PigExecutor.getExecutor().submitJob(modelConfig, pathFinder.getScriptPath(pigScript), paramsMap, evalConfig.getDataSet().getSource(), confMap, super.pathFinder);
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
    for (JobStats jobStats : PigStats.get().getJobGraph()) {
        // A job without Hadoop counters or without the Shifu counter group cannot carry eval
        // statistics; skip it instead of failing with a NullPointerException.
        if (jobStats.getHadoopCounters() == null || jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER) == null) {
            continue;
        }
        long evalRecords = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_RECORDS);
        LOG.info("Total valid eval records is : {}", evalRecords);
        // If no basic record counter, check next one
        if (evalRecords == 0L) {
            continue;
        }
        this.evalRecords = evalRecords;
        long pigPosTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_POSTAGS);
        long pigNegTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_NEGTAGS);
        // Weighted tag counters are divided by EVAL_COUNTER_WEIGHT_SCALE to obtain fractional values
        double pigPosWeightTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_WPOSTAGS) / (Constants.EVAL_COUNTER_WEIGHT_SCALE * 1.0d);
        double pigNegWeightTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_WNEGTAGS) / (Constants.EVAL_COUNTER_WEIGHT_SCALE * 1.0d);
        LOG.info("Total positive record count is : {}", pigPosTags);
        LOG.info("Total negative record count is : {}", pigNegTags);
        LOG.info("Total weighted positive record count is : {}", pigPosWeightTags);
        LOG.info("Total weighted negative record count is : {}", pigNegWeightTags);
        long totalRunTime = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.TOTAL_MODEL_RUNTIME);
        // evalRecords is non-zero here, so the integer division is safe
        LOG.info("Avg SLA for eval model scoring is {} micro seconds", totalRunTime / evalRecords);
        double maxScore = Integer.MIN_VALUE;
        double minScore = Integer.MAX_VALUE;
        if (modelConfig.isRegression()) {
            double[] maxMinScores = locateMaxMinScoreFromFile(sourceType, maxMinScoreFolder);
            maxScore = maxMinScores[0];
            minScore = maxMinScores[1];
            LOG.info("Raw max score is {}, raw min score is {}", maxScore, minScore);
            ShifuFileUtils.deleteFile(maxMinScoreFolder, sourceType);
        }
        // only one pig job with such counters, return
        return new ScoreStatus(pigPosTags, pigNegTags, pigPosWeightTags, pigNegWeightTags, maxScore, minScore, evalRecords);
    }
    return null;
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) IOException(java.io.IOException) JobStats(org.apache.pig.tools.pigstats.JobStats) ShifuException(ml.shifu.shifu.exception.ShifuException)

Example 10 with JobStats

use of org.apache.pig.tools.pigstats.JobStats in project shifu by ShifuML.

the class NormalizeModelProcessor method runPigNormalize.

/**
 * Running pig normalize process.
 * <p>
 * Cleans up the normalized and selected raw data outputs, submits the normalize Pig script that
 * matches the parquet / post-train settings, logs the share of invalid tag records reported by
 * the Shifu counters, and, when a validation data set path is configured, submits the same
 * script a second time for the validation data.
 *
 * @throws IOException
 *             any IO exception raised outside the Pig submission block; an {@code IOException}
 *             raised inside that block is rethrown as {@code ShifuException}
 */
@SuppressWarnings("deprecation")
private void runPigNormalize() throws IOException {
    SourceType sourceType = modelConfig.getDataSet().getSource();
    ShifuFileUtils.deleteFile(pathFinder.getNormalizedDataPath(), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getSelectedRawDataPath(), sourceType);
    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put("sampleRate", modelConfig.getNormalizeSampleRate().toString());
    paramsMap.put("sampleNegOnly", ((Boolean) modelConfig.isNormalizeSampleNegOnly()).toString());
    paramsMap.put("delimiter", CommonUtils.escapePigString(modelConfig.getDataSetDelimiter()));
    paramsMap.put("is_csv", String.valueOf(Boolean.TRUE.toString().equalsIgnoreCase(Environment.getProperty(Constants.SHIFU_OUTPUT_DATA_CSV, Boolean.FALSE.toString()))));
    String expressionsAsString = super.modelConfig.getSegmentFilterExpressionsAsString();
    Environment.getProperties().put("shifu.segment.expressions", expressionsAsString);
    try {
        String normPigPath;
        if (modelConfig.getNormalize().getIsParquet()) {
            if (modelConfig.getBasic().getPostTrainOn()) {
                normPigPath = pathFinder.getScriptPath("scripts/NormalizeWithParquetAndPostTrain.pig");
            } else {
                log.info("Post train is disabled by 'postTrainOn=false'.");
                normPigPath = pathFinder.getScriptPath("scripts/NormalizeWithParquet.pig");
            }
        } else {
            // No matter post train enabled or not, only norm results will be stored since new
            // post train solution no need to prepare data, so a single script covers both cases.
            normPigPath = pathFinder.getScriptPath("scripts/Normalize.pig");
        }
        paramsMap.put(Constants.IS_COMPRESS, "true");
        paramsMap.put(Constants.IS_NORM_FOR_CLEAN, "false");
        PigExecutor.getExecutor().submitJob(modelConfig, normPigPath, paramsMap);
        for (JobStats jobStats : PigStats.get().getJobGraph()) {
            if (jobStats.getHadoopCounters() != null && jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER) != null) {
                long totalValidCount = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter("TOTAL_VALID_COUNT");
                // If no basic record counter, check next one
                if (totalValidCount == 0L) {
                    continue;
                }
                long invalidTagCount = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter("INVALID_TAG");
                log.info("Total valid records {} after filtering, invalid tag records {}.", totalValidCount, invalidTagCount);
                // Only reported in the log; the normalize run itself is not aborted
                if (totalValidCount > 0L && invalidTagCount * 1d / totalValidCount >= 0.8d) {
                    log.error("Too many invalid tags, please check you configuration on positive tags and negative tags.");
                }
            }
            // only one pig job with such counters, break
            break;
        }
        if (StringUtils.isNotBlank(modelConfig.getValidationDataSetRawPath())) {
            // Reuse the same script and parameter map, redirected to the validation data set
            ShifuFileUtils.deleteFile(pathFinder.getNormalizedValidationDataPath(), sourceType);
            paramsMap.put(Constants.IS_COMPRESS, "false");
            paramsMap.put(Constants.IS_VALIDATION_DATASET, "true");
            paramsMap.put(Constants.PATH_RAW_DATA, modelConfig.getValidationDataSetRawPath());
            paramsMap.put(Constants.PATH_NORMALIZED_DATA, pathFinder.getNormalizedValidationDataPath());
            PigExecutor.getExecutor().submitJob(modelConfig, normPigPath, paramsMap);
        }
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
}
Also used : HashMap(java.util.HashMap) SourceType(ml.shifu.shifu.container.obj.RawSourceData.SourceType) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException) JobStats(org.apache.pig.tools.pigstats.JobStats)

Aggregations

JobStats (org.apache.pig.tools.pigstats.JobStats)10 IOException (java.io.IOException)6 PigStats (org.apache.pig.tools.pigstats.PigStats)5 SourceType (ml.shifu.shifu.container.obj.RawSourceData.SourceType)4 ShifuException (ml.shifu.shifu.exception.ShifuException)4 HashMap (java.util.HashMap)3 MRJobStats (org.apache.pig.tools.pigstats.mapreduce.MRJobStats)3 Field (java.lang.reflect.Field)2 Path (org.apache.hadoop.fs.Path)2 SimplePigStats (org.apache.pig.tools.pigstats.mapreduce.SimplePigStats)2 Event (com.twitter.ambrose.model.Event)1 VespaCounters (com.yahoo.vespa.hadoop.mapreduce.util.VespaCounters)1 SimpleDateFormat (java.text.SimpleDateFormat)1 ArrayList (java.util.ArrayList)1 Date (java.util.Date)1 RawSourceData (ml.shifu.shifu.container.obj.RawSourceData)1 Counters (org.apache.hadoop.mapred.Counters)1 PigServer (org.apache.pig.PigServer)1 ExecJob (org.apache.pig.backend.executionengine.ExecJob)1 InputStats (org.apache.pig.tools.pigstats.InputStats)1