use of org.apache.pig.tools.pigstats.JobStats in project oozie by apache.
the class PigMain method getHadoopJobIds.
/**
 * Collects the Hadoop job ids exposed by the job graph of the given PigStats.
 * <p>
 * A job id is skipped when {@code StringUtils.isEmpty} reports it as empty or when, after
 * trimming, it equals the literal "NULL" (ignoring case).
 *
 * @param pigStats stats object obtained through PigStats API
 * @return the retained job ids joined with commas (empty string when none is retained), or
 *         {@code null} when the Pig API throws {@link UnsupportedOperationException}
 */
protected String getHadoopJobIds(PigStats pigStats) {
    final String delimiter = ",";
    StringBuilder joinedIds = new StringBuilder(STRING_BUFFER_SIZE);
    try {
        for (JobStats stats : pigStats.getJobGraph()) {
            String id = stats.getJobId();
            boolean usable = !StringUtils.isEmpty(id) && !"NULL".equalsIgnoreCase(id.trim());
            if (usable) {
                // the delimiter goes in front of every id except the first one
                if (joinedIds.length() != 0) {
                    joinedIds.append(delimiter);
                }
                joinedIds.append(id);
            }
        }
    } catch (UnsupportedOperationException uoe) {
        // Pig API's are not supported: signal that with null
        return null;
    }
    return joinedIds.toString();
}
use of org.apache.pig.tools.pigstats.JobStats in project ambrose by twitter.
the class AmbrosePigProgressNotificationListener method jobStartedNotification.
/**
 * Called when a job is started. This is the first time that we are notified of a new jobId for a
 * launched job. Hence this method binds the jobId to the DAGNode and pushes a status event.
 * <p>
 * If the started job's name is not a key of the DAG node name map, a warning is logged and the
 * method returns without inspecting the remaining jobs of the graph.
 *
 * @param scriptId scriptId of the running script
 * @param assignedJobId the jobId assigned to the job started.
 */
@Override
public void jobStartedNotification(String scriptId, String assignedJobId) {
    log.info("jobStartedNotification - scriptId " + scriptId + " jobId " + assignedJobId);
    // For each job in the graph, look up its scope and bind the jobId to the DAGNode with the same scope.
    for (JobStats jobStats : pigConfig.getJobGraph()) {
        if (!assignedJobId.equals(jobStats.getJobId())) {
            continue;
        }
        String scope = jobStats.getName();
        log.info("jobStartedNotification - scope " + scope + " is jobId " + assignedJobId);
        DAGNode<PigJob> node = this.dagNodeNameMap.get(scope);
        if (node == null) {
            log.warn("jobStartedNotification - unrecognized operator name found (" + scope + ") for jobId " + assignedJobId);
            return;
        }
        PigJob job = node.getJob();
        job.setId(assignedJobId);
        mapReduceHelper.addMapReduceJobState(job, pigConfig.getJobClient());
        // index the node by its job id before publishing the started event
        dagNodeJobIdMap.put(job.getId(), node);
        AmbroseUtils.pushEvent(statsWriteService, scriptId, new Event.JobStartedEvent(node));
    }
}
use of org.apache.pig.tools.pigstats.JobStats in project shifu by ShifuML.
the class ModelDataEncodeProcessor method encodeModelData.
/**
 * Runs the {@code scripts/EncodeData.pig} script and inspects the Shifu counters of the
 * resulting pig jobs to detect a suspiciously high ratio of invalid tags.
 *
 * @param evalConfig
 *            the eval set to encode; when {@code null} the raw path of the model data set and
 *            the training data set name are passed to the script instead
 * @return 0 by default, 1 when invalid tag records divided by total valid records is at
 *         least 0.8 for the first job reporting a non-zero valid record count
 * @throws IOException
 *             declared for the steps outside the pig run (output clean-up); an IOException
 *             raised while running the pig job is rethrown as a ShifuException
 */
@SuppressWarnings("deprecation")
private int encodeModelData(EvalConfig evalConfig) throws IOException {
    int status = 0;
    RawSourceData.SourceType sourceType = this.modelConfig.getDataSet().getSource();
    // clean up output directories
    ShifuFileUtils.deleteFile(pathFinder.getEncodeDataPath(evalConfig), sourceType);
    // prepare special parameters and execute pig
    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put(Constants.SOURCE_TYPE, sourceType.toString());
    paramsMap.put("pathRawData", (evalConfig == null) ? modelConfig.getDataSetRawPath() : evalConfig.getDataSet().getDataPath());
    paramsMap.put("pathEncodeData", pathFinder.getEncodeDataPath(evalConfig));
    paramsMap.put("delimiter", CommonUtils.escapePigString(modelConfig.getDataSetDelimiter()));
    paramsMap.put("evalSetName", (evalConfig == null ? TRAINING_DATA_SET : evalConfig.getName()));
    paramsMap.put(Constants.IS_COMPRESS, "true");
    try {
        String encodePigPath = pathFinder.getScriptPath("scripts/EncodeData.pig");
        PigExecutor.getExecutor().submitJob(modelConfig, encodePigPath, paramsMap);
        for (JobStats jobStats : PigStats.get().getJobGraph()) {
            // A job without Shifu counters cannot be the one we are looking for: check next one
            // instead of ending the search on it.
            if (jobStats.getHadoopCounters() == null || jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER) == null) {
                continue;
            }
            long totalValidCount = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter("TOTAL_VALID_COUNT");
            // If no basic record counter, check next one
            if (totalValidCount == 0L) {
                continue;
            }
            long invalidTagCount = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter("INVALID_TAG");
            LOG.info("Total valid records {} after filtering, invalid tag records {}.", totalValidCount, invalidTagCount);
            if (totalValidCount > 0L && invalidTagCount * 1d / totalValidCount >= 0.8d) {
                LOG.error("Too many invalid tags, please check you configuration on positive tags and negative tags.");
                status = 1;
            }
            // only one pig job with such counters, break
            break;
        }
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
    return status;
}
use of org.apache.pig.tools.pigstats.JobStats in project shifu by ShifuML.
the class EvalModelProcessor method runDistScore.
/**
 * Run pig mode scoring for one evaluation set and collect the scoring counters.
 *
 * @param evalConfig
 *            the evaluation set configuration to score
 * @return the score status built from the counters of the first pig job that reports a
 *         non-zero record counter, or {@code null} when no job does
 * @throws IOException
 *             any io exception
 */
@SuppressWarnings("deprecation")
private ScoreStatus runDistScore(EvalConfig evalConfig) throws IOException {
    // clean up output directories
    SourceType sourceType = evalConfig.getDataSet().getSource();
    ShifuFileUtils.deleteFile(pathFinder.getEvalNormalizedPath(evalConfig), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getEvalScorePath(evalConfig), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getEvalPerformancePath(evalConfig), sourceType);
    // prepare special parameters and execute pig
    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put(Constants.SOURCE_TYPE, sourceType.toString());
    paramsMap.put("pathEvalRawData", evalConfig.getDataSet().getDataPath());
    paramsMap.put("pathEvalNormalized", pathFinder.getEvalNormalizedPath(evalConfig));
    paramsMap.put("pathEvalScore", pathFinder.getEvalScorePath(evalConfig));
    paramsMap.put("pathEvalPerformance", pathFinder.getEvalPerformancePath(evalConfig));
    paramsMap.put("eval_set_name", evalConfig.getName());
    paramsMap.put("delimiter", CommonUtils.escapePigString(evalConfig.getDataSet().getDataDelimiter()));
    paramsMap.put("columnIndex", evalConfig.getPerformanceScoreSelector().trim());
    paramsMap.put("scale", Environment.getProperty(Constants.SHIFU_SCORE_SCALE, Integer.toString(Scorer.DEFAULT_SCORE_SCALE)));
    String expressionsAsString = super.modelConfig.getSegmentFilterExpressionsAsString();
    Environment.getProperties().put("shifu.segment.expressions", expressionsAsString);
    String pigScript = "scripts/Eval.pig";
    Map<String, String> confMap = new HashMap<String, String>();
    // max min score folder
    String maxMinScoreFolder = ShifuFileUtils.getFileSystemBySourceType(sourceType).makeQualified(new Path("tmp" + File.separator + "maxmin_score_" + System.currentTimeMillis() + "_" + RANDOM.nextLong())).toString();
    confMap.put(Constants.SHIFU_EVAL_MAXMIN_SCORE_OUTPUT, maxMinScoreFolder);
    if (modelConfig.isClassification() || (isNoSort() && EvalStep.SCORE.equals(this.evalStep))) {
        pigScript = "scripts/EvalScore.pig";
    }
    try {
        PigExecutor.getExecutor().submitJob(modelConfig, pathFinder.getScriptPath(pigScript), paramsMap, evalConfig.getDataSet().getSource(), confMap, super.pathFinder);
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
    for (JobStats jobStats : PigStats.get().getJobGraph()) {
        // Counters (or the Shifu counter group) may be missing for a job; skip such a job
        // instead of failing with a NullPointerException.
        if (jobStats.getHadoopCounters() == null || jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER) == null) {
            continue;
        }
        long recordCount = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_RECORDS);
        LOG.info("Total valid eval records is : {}", recordCount);
        // If no basic record counter, check next one
        if (recordCount == 0L) {
            continue;
        }
        this.evalRecords = recordCount;
        long pigPosTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_POSTAGS);
        long pigNegTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_NEGTAGS);
        // weighted counters are divided by the scale constant to obtain fractional weights
        double pigPosWeightTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_WPOSTAGS) / (Constants.EVAL_COUNTER_WEIGHT_SCALE * 1.0d);
        double pigNegWeightTags = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.COUNTER_WNEGTAGS) / (Constants.EVAL_COUNTER_WEIGHT_SCALE * 1.0d);
        LOG.info("Total positive record count is : {}", pigPosTags);
        LOG.info("Total negative record count is : {}", pigNegTags);
        LOG.info("Total weighted positive record count is : {}", pigPosWeightTags);
        LOG.info("Total weighted negative record count is : {}", pigNegWeightTags);
        long totalRunTime = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter(Constants.TOTAL_MODEL_RUNTIME);
        // recordCount is non-zero here, so the division is safe
        LOG.info("Avg SLA for eval model scoring is {} micro seconds", totalRunTime / recordCount);
        double maxScore = Integer.MIN_VALUE;
        double minScore = Integer.MAX_VALUE;
        if (modelConfig.isRegression()) {
            double[] maxMinScores = locateMaxMinScoreFromFile(sourceType, maxMinScoreFolder);
            maxScore = maxMinScores[0];
            minScore = maxMinScores[1];
            LOG.info("Raw max score is {}, raw min score is {}", maxScore, minScore);
            ShifuFileUtils.deleteFile(maxMinScoreFolder, sourceType);
        }
        // only one pig job with such counters, return
        return new ScoreStatus(pigPosTags, pigNegTags, pigPosWeightTags, pigNegWeightTags, maxScore, minScore, recordCount);
    }
    return null;
}
use of org.apache.pig.tools.pigstats.JobStats in project shifu by ShifuML.
the class NormalizeModelProcessor method runPigNormalize.
/**
 * Running pig normalize process: submits the normalize pig script for the training data,
 * checks the invalid tag ratio from the Shifu counters, and, when a validation data set path
 * is configured, submits the same script again for the validation data.
 *
 * @throws IOException
 *             any IO exception raised outside the pig run; an IOException raised while
 *             running the pig jobs is rethrown as a ShifuException
 */
@SuppressWarnings("deprecation")
private void runPigNormalize() throws IOException {
    SourceType sourceType = modelConfig.getDataSet().getSource();
    ShifuFileUtils.deleteFile(pathFinder.getNormalizedDataPath(), sourceType);
    ShifuFileUtils.deleteFile(pathFinder.getSelectedRawDataPath(), sourceType);
    Map<String, String> paramsMap = new HashMap<String, String>();
    paramsMap.put("sampleRate", modelConfig.getNormalizeSampleRate().toString());
    paramsMap.put("sampleNegOnly", ((Boolean) modelConfig.isNormalizeSampleNegOnly()).toString());
    paramsMap.put("delimiter", CommonUtils.escapePigString(modelConfig.getDataSetDelimiter()));
    paramsMap.put("is_csv", String.valueOf(Boolean.TRUE.toString().equalsIgnoreCase(Environment.getProperty(Constants.SHIFU_OUTPUT_DATA_CSV, Boolean.FALSE.toString()))));
    String expressionsAsString = super.modelConfig.getSegmentFilterExpressionsAsString();
    Environment.getProperties().put("shifu.segment.expressions", expressionsAsString);
    try {
        String normPigPath = null;
        if (modelConfig.getNormalize().getIsParquet()) {
            if (modelConfig.getBasic().getPostTrainOn()) {
                normPigPath = pathFinder.getScriptPath("scripts/NormalizeWithParquetAndPostTrain.pig");
            } else {
                log.info("Post train is disabled by 'postTrainOn=false'.");
                normPigPath = pathFinder.getScriptPath("scripts/NormalizeWithParquet.pig");
            }
        } else {
            // no matter post train enabled or not, only norm results will be stored since new
            // post train solution no need to prepare data
            normPigPath = pathFinder.getScriptPath("scripts/Normalize.pig");
        }
        paramsMap.put(Constants.IS_COMPRESS, "true");
        paramsMap.put(Constants.IS_NORM_FOR_CLEAN, "false");
        PigExecutor.getExecutor().submitJob(modelConfig, normPigPath, paramsMap);
        for (JobStats jobStats : PigStats.get().getJobGraph()) {
            // A job without Shifu counters cannot be the one we are looking for: check next one
            // instead of ending the search on it.
            if (jobStats.getHadoopCounters() == null || jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER) == null) {
                continue;
            }
            long totalValidCount = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter("TOTAL_VALID_COUNT");
            // If no basic record counter, check next one
            if (totalValidCount == 0L) {
                continue;
            }
            long invalidTagCount = jobStats.getHadoopCounters().getGroup(Constants.SHIFU_GROUP_COUNTER).getCounter("INVALID_TAG");
            log.info("Total valid records {} after filtering, invalid tag records {}.", totalValidCount, invalidTagCount);
            if (totalValidCount > 0L && invalidTagCount * 1d / totalValidCount >= 0.8d) {
                log.error("Too many invalid tags, please check you configuration on positive tags and negative tags.");
            }
            // only one pig job with such counters, break
            break;
        }
        if (StringUtils.isNotBlank(modelConfig.getValidationDataSetRawPath())) {
            // re-run the same script against the validation data set with adjusted parameters
            ShifuFileUtils.deleteFile(pathFinder.getNormalizedValidationDataPath(), sourceType);
            paramsMap.put(Constants.IS_COMPRESS, "false");
            paramsMap.put(Constants.IS_VALIDATION_DATASET, "true");
            paramsMap.put(Constants.PATH_RAW_DATA, modelConfig.getValidationDataSetRawPath());
            paramsMap.put(Constants.PATH_NORMALIZED_DATA, pathFinder.getNormalizedValidationDataPath());
            PigExecutor.getExecutor().submitJob(modelConfig, normPigPath, paramsMap);
        }
    } catch (IOException e) {
        throw new ShifuException(ShifuErrorCode.ERROR_RUNNING_PIG_JOB, e);
    } catch (Throwable e) {
        throw new RuntimeException(e);
    }
}
Aggregations