Example usage of com.google.common.base.Splitter in project shifu by ShifuML:
class MapReducerStatsWorker, method runPSI.
/**
 * Calculates the PSI (Population Stability Index) for the PSI column configured in
 * {@code ModelConfig}.
 *
 * <p>The configured PSI column must exist and be either a meta or a categorical column;
 * otherwise the computation is skipped with a warning. The actual aggregation is delegated
 * to the {@code scripts/PSI.pig} job; this method then reads the job output back, updates
 * the in-memory {@link ColumnConfig} list with the per-column PSI values, and persists the
 * per-column unit stats to a local temporary file.
 *
 * @throws IOException
 *             in scanners read exception
 */
public void runPSI() throws IOException {
    log.info("Run PSI to use {} to compute the PSI ", modelConfig.getPsiColumnName());
    ColumnConfig columnConfig = CommonUtils.findColumnConfigByName(columnConfigList,
            modelConfig.getPsiColumnName());
    // PSI can only be bucketed by a meta or a categorical column; bail out early otherwise
    if (columnConfig == null || (!columnConfig.isMeta() && !columnConfig.isCategorical())) {
        log.warn("Unable to use the PSI column {} specified in ModelConfig to compute PSI: "
                + "neither meta nor categorical type",
                columnConfig != null ? columnConfig.getColumnName() : "unknown");
        return;
    }
    log.info("Start to use {} to compute the PSI ", columnConfig.getColumnName());

    Map<String, String> paramsMap = new HashMap<>();
    paramsMap.put("delimiter", CommonUtils.escapePigString(modelConfig.getDataSetDelimiter()));
    paramsMap.put("PSIColumn", modelConfig.getPsiColumnName().trim());
    // guard against a parallelism of 0 when there are fewer than 10 columns
    paramsMap.put("column_parallel", Integer.toString(Math.max(1, columnConfigList.size() / 10)));
    paramsMap.put("value_index", "2");
    PigExecutor.getExecutor().submitJob(modelConfig, pathFinder.getScriptPath("scripts/PSI.pig"), paramsMap);

    List<Scanner> scanners = ShifuFileUtils.getDataScanners(pathFinder.getPSIInfoPath(),
            modelConfig.getDataSet().getSource());
    if (CollectionUtils.isEmpty(scanners)) {
        log.info("The PSI got failure during the computation");
        return;
    }

    String delimiter = Environment.getProperty(Constants.SHIFU_OUTPUT_DATA_DELIMITER,
            Constants.DEFAULT_DELIMITER);
    Splitter splitter = Splitter.on(delimiter).trimResults();
    List<String> unitStats = new ArrayList<String>(this.columnConfigList.size());
    for (Scanner scanner : scanners) {
        while (scanner.hasNext()) {
            // pig output line format: columnNum <delim> psi <delim> unitStats
            String[] output = Lists.newArrayList(splitter.split(scanner.nextLine())).toArray(new String[0]);
            try {
                int columnNum = Integer.parseInt(output[0]);
                ColumnConfig config = this.columnConfigList.get(columnNum);
                config.setPSI(Double.parseDouble(output[1]));
                unitStats.add(output[0] + "|" + output[2]);
            } catch (Exception e) {
                // a malformed line must not abort the whole PSI load; log and keep going
                log.error("error in parsing", e);
            }
        }
        // close scanner
        IOUtils.closeQuietly(scanner);
    }

    // write unit stat into a temporary file
    ShifuFileUtils.createDirIfNotExists(new SourceFile(Constants.TMP, RawSourceData.SourceType.LOCAL));
    String ccUnitStatsFile = this.pathFinder.getColumnConfigUnitStatsPath();
    ShifuFileUtils.writeLines(unitStats, ccUnitStatsFile, RawSourceData.SourceType.LOCAL);
    log.info("The Unit Stats is stored in - {}.", ccUnitStatsFile);

    log.info("Run PSI - done.");
}
Example usage of com.google.common.base.Splitter in project shifu by ShifuML:
class ConfusionMatrix, method bufferedComputeConfusionMatrixAndPerformance.
/**
 * Streams eval score records and incrementally builds a confusion matrix plus bucketed
 * performance curves: false-positive rate, catch rate (recall), action rate (gain), their
 * weighted variants, and a model-score distribution.
 *
 * <p>NOTE(review): the per-record tp/fp increments paired with fn/tn decrements assume the
 * score file is sorted by score descending (each record crosses the moving cutoff from the
 * predicted-negative side to the predicted-positive side) — confirm against the pig job
 * that produces {@code scoreDataPath}.
 *
 * @param pigPosTags total positive record count from the pig stats job
 * @param pigNegTags total negative record count from the pig stats job
 * @param pigPosWeightTags total weighted positive count
 * @param pigNegWeightTags total weighted negative count
 * @param records total record count (not read in this method body)
 * @param maxPScore max raw score observed by the pig job (may already be scaled)
 * @param minPScore min raw score observed by the pig job
 * @param scoreDataPath path of the eval score output to scan
 * @param evalPerformancePath path the performance result is written to
 * @param isPrint whether to log the bucketed result tables
 * @param isGenerateChart whether to emit chart/JSON performance files
 * @param targetColumnIndex column index of the tag in each score row
 * @param scoreColumnIndex column index of the model score in each score row
 * @param weightColumnIndex column index of the weight; &lt;= 0 means unweighted (weight 1)
 * @param isUseMaxMinScore whether to use the observed max/min scores instead of [0, scoreScale]
 * @return the aggregated {@link PerformanceResult}
 * @throws IOException if the score files cannot be read
 * @throws ShifuException with {@code ERROR_EVALSCORE} when no score record was read at all
 */
public PerformanceResult bufferedComputeConfusionMatrixAndPerformance(long pigPosTags, long pigNegTags,
        double pigPosWeightTags, double pigNegWeightTags, long records, double maxPScore, double minPScore,
        String scoreDataPath, String evalPerformancePath, boolean isPrint, boolean isGenerateChart,
        int targetColumnIndex, int scoreColumnIndex, int weightColumnIndex, boolean isUseMaxMinScore)
        throws IOException {
    // 1. compute maxScore and minScore in case some cases score are not in [0, 1]
    double maxScore = 1d * scoreScale, minScore = 0d;
    if (isGBTNeedConvertScore()) {
        // if need convert to [0, 1], just keep max score to 1 and min score to 0 without doing anything
    } else {
        if (isUseMaxMinScore) {
            // TODO some cases maxPScore is already scaled, how to fix that issue
            maxScore = maxPScore;
            minScore = minPScore;
        } else {
            // otherwise, keep [0, 1]
        }
    }
    LOG.info("{} Transformed (scale included) max score is {}, transformed min score is {}",
            evalConfig.getGbtScoreConvertStrategy(), maxScore, minScore);

    SourceType sourceType = evalConfig.getDataSet().getSource();
    List<Scanner> scanners = ShifuFileUtils.getDataScanners(scoreDataPath, sourceType);
    LOG.info("Number of score files is {} in eval {}.", scanners.size(), evalConfig.getName());

    int numBucket = evalConfig.getPerformanceBucketNum();
    boolean hasWeight = StringUtils.isNotBlank(evalConfig.getDataSet().getWeightColumnName());
    boolean isDir = ShifuFileUtils.isDir(pathFinder.getEvalScorePath(evalConfig, sourceType), sourceType);

    // one list per performance curve; capacity numBucket + 1 because a synthetic
    // "everything positive" first point is added up front
    List<PerformanceObject> FPRList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> catchRateList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> gainList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> modelScoreList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> FPRWeightList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> catchRateWeightList = new ArrayList<PerformanceObject>(numBucket + 1);
    List<PerformanceObject> gainWeightList = new ArrayList<PerformanceObject>(numBucket + 1);

    // binScore: width of one model-score bucket; binCapacity: fraction of one rate bucket;
    // scoreBinCount/scoreBinWeigthedCount: running totals reset at each score-bucket boundary
    double binScore = (maxScore - minScore) * 1d / numBucket, binCapacity = 1.0 / numBucket, scoreBinCount = 0,
            scoreBinWeigthedCount = 0;
    // next bucket index to fill for each of the seven curves
    int fpBin = 1, tpBin = 1, gainBin = 1, fpWeightBin = 1, tpWeightBin = 1, gainWeightBin = 1, modelScoreBin = 1;
    long index = 0, cnt = 0, invalidTargetCnt = 0, invalidWgtCnt = 0;

    // initial matrix: everything classified positive (tp = all pos, fp = all neg)
    ConfusionMatrixObject prevCmo = buildInitalCmo(pigPosTags, pigNegTags, pigPosWeightTags, pigNegWeightTags,
            maxScore);
    PerformanceObject po = buildFirstPO(prevCmo);
    FPRList.add(po);
    catchRateList.add(po);
    gainList.add(po);
    FPRWeightList.add(po);
    catchRateWeightList.add(po);
    gainWeightList.add(po);
    modelScoreList.add(po);

    boolean isGBTScoreHalfCutoffStreategy = isGBTScoreHalfCutoffStreategy();
    boolean isGBTScoreMaxMinScaleStreategy = isGBTScoreMaxMinScaleStreategy();

    Splitter splitter = Splitter.on(delimiter).trimResults();

    for (Scanner scanner : scanners) {
        while (scanner.hasNext()) {
            if ((++cnt) % 100000L == 0L) {
                LOG.info("Loaded {} records.", cnt);
            }
            if ((!isDir) && cnt == 1) {
                // if the evaluation score file is the local file, skip the first line since we add
                // NOTE(review): this continues BEFORE scanner.nextLine() is invoked, so the header
                // line is not consumed here — it is read on the NEXT iteration (as cnt == 2) and
                // then rejected by the tag check below. Verify this is the intended header skip.
                continue;
            }

            // score is separated by default delimiter in our pig output format
            String[] raw = Lists.newArrayList(splitter.split(scanner.nextLine())).toArray(new String[0]);

            // tag check: blank or unknown tags are counted and skipped
            String tag = raw[targetColumnIndex];
            if (StringUtils.isBlank(tag) || (!posTags.contains(tag) && !negTags.contains(tag))) {
                invalidTargetCnt += 1;
                continue;
            }

            double weight = 1d;
            // if has weight
            if (weightColumnIndex > 0) {
                try {
                    weight = Double.parseDouble(raw[weightColumnIndex]);
                } catch (NumberFormatException e) {
                    // unparsable weight is counted but the record is still processed with weight 1
                    invalidWgtCnt += 1;
                }
                if (weight < 0d) {
                    // negative weights are invalid; fall back to 1
                    invalidWgtCnt += 1;
                    weight = 1d;
                }
            }

            double score = 0.0;
            try {
                score = Double.parseDouble(raw[scoreColumnIndex]);
            } catch (NumberFormatException e) {
                // user set the score column wrong ?
                // sample the warning (~5%) to avoid flooding the log on a misconfigured column
                if (Math.random() < 0.05) {
                    LOG.warn("The score column - {} is not number. Is score column set correctly?",
                            raw[scoreColumnIndex]);
                }
                continue;
            }

            scoreBinCount += 1;
            scoreBinWeigthedCount += weight;

            // move this record from the predicted-negative side to the predicted-positive side
            ConfusionMatrixObject cmo = new ConfusionMatrixObject(prevCmo);
            if (posTags.contains(tag)) {
                // Positive Instance
                cmo.setTp(cmo.getTp() + 1);
                cmo.setFn(cmo.getFn() - 1);
                cmo.setWeightedTp(cmo.getWeightedTp() + weight * 1.0);
                cmo.setWeightedFn(cmo.getWeightedFn() - weight * 1.0);
            } else {
                // Negative Instance
                cmo.setFp(cmo.getFp() + 1);
                cmo.setTn(cmo.getTn() - 1);
                cmo.setWeightedFp(cmo.getWeightedFp() + weight * 1.0);
                cmo.setWeightedTn(cmo.getWeightedTn() - weight * 1.0);
            }

            if (isGBTScoreHalfCutoffStreategy) {
                // use max min scale to rescale to [0, 1]
                if (score < 0d) {
                    score = 0d;
                }
                score = ((score - 0) * scoreScale) / (maxPScore - 0);
            } else if (isGBTScoreMaxMinScaleStreategy) {
                // use max min scaler to make score in [0, 1], don't foget to time scoreScale
                score = ((score - minPScore) * scoreScale) / (maxPScore - minPScore);
            } else {
                // do nothing, use current score
            }

            cmo.setScore(Double.parseDouble(SCORE_FORMAT.format(score)));

            ConfusionMatrixObject object = cmo;
            po = PerformanceEvaluator.setPerformanceObject(object);

            // record the current point into each curve whose next bucket threshold is crossed
            if (po.fpr >= fpBin * binCapacity) {
                po.binNum = fpBin++;
                FPRList.add(po);
            }

            if (po.recall >= tpBin * binCapacity) {
                po.binNum = tpBin++;
                catchRateList.add(po);
            }

            // prevent 99%
            double validRecordCnt = (double) (index + 1);
            if (validRecordCnt / (pigPosTags + pigNegTags) >= gainBin * binCapacity) {
                po.binNum = gainBin++;
                gainList.add(po);
            }

            if (po.weightedFpr >= fpWeightBin * binCapacity) {
                po.binNum = fpWeightBin++;
                FPRWeightList.add(po);
            }

            if (po.weightedRecall >= tpWeightBin * binCapacity) {
                po.binNum = tpWeightBin++;
                catchRateWeightList.add(po);
            }

            if ((object.getWeightedTp() + object.getWeightedFp()) / object.getWeightedTotal()
                    >= gainWeightBin * binCapacity) {
                po.binNum = gainWeightBin++;
                gainWeightList.add(po);
            }

            // model-score buckets run from maxScore downwards in binScore steps
            if ((maxScore - (modelScoreBin * binScore)) >= score) {
                po.binNum = modelScoreBin++;
                po.scoreCount = scoreBinCount;
                po.scoreWgtCount = scoreBinWeigthedCount;
                // reset to 0 for next bin score cnt stats
                scoreBinCount = scoreBinWeigthedCount = 0;
                modelScoreList.add(po);
            }

            index += 1;
            prevCmo = cmo;
        }
        scanner.close();
    }
    LOG.info("Totally loading {} records with invalid target records {} and invalid weight records {} in eval {}.",
            cnt, invalidTargetCnt, invalidWgtCnt, evalConfig.getName());

    PerformanceResult result = buildPerfResult(FPRList, catchRateList, gainList, modelScoreList, FPRWeightList,
            catchRateWeightList, gainWeightList);

    // serialize logging/writing so concurrent eval runs do not interleave output
    synchronized (this.lock) {
        if (isPrint) {
            PerformanceEvaluator.logResult(FPRList, "Bucketing False Positive Rate");

            if (hasWeight) {
                PerformanceEvaluator.logResult(FPRWeightList, "Bucketing Weighted False Positive Rate");
            }

            PerformanceEvaluator.logResult(catchRateList, "Bucketing Catch Rate");

            if (hasWeight) {
                PerformanceEvaluator.logResult(catchRateWeightList, "Bucketing Weighted Catch Rate");
            }

            PerformanceEvaluator.logResult(gainList, "Bucketing Action Rate");

            if (hasWeight) {
                PerformanceEvaluator.logResult(gainWeightList, "Bucketing Weighted Action Rate");
            }

            PerformanceEvaluator.logAucResult(result, hasWeight);
        }

        writePerResult2File(evalPerformancePath, result);

        if (isGenerateChart) {
            generateChartAndJsonPerfFiles(hasWeight, result);
        }
    }

    if (cnt == 0) {
        LOG.error("No score read, the EvalScore did not genernate or is null file");
        throw new ShifuException(ShifuErrorCode.ERROR_EVALSCORE);
    }
    return result;
}
Example usage of com.google.common.base.Splitter in project symja_android_library by axkr:
class StringMapFunctions, method tokenizeAndSort.
/**
 * Returns a new column of the same size where each value has been tokenized on
 * {@code separator}, the tokens sorted in natural order, and rejoined with the same
 * separator. Tokens are trimmed and empty tokens are dropped.
 *
 * @param separator the delimiter used for both splitting and rejoining
 * @return a new column named {@code name() + "[sorted]"}
 */
default StringColumn tokenizeAndSort(String separator) {
    StringColumn newColumn = StringColumn.create(name() + "[sorted]", this.size());
    // Splitter is immutable and depends only on the separator — build it once,
    // not once per row.
    Splitter splitter = Splitter.on(separator).trimResults().omitEmptyStrings();
    for (int r = 0; r < size(); r++) {
        // copy because splitToList returns an immutable list and sort mutates in place
        List<String> tokens = new ArrayList<>(splitter.splitToList(getString(r)));
        Collections.sort(tokens);
        newColumn.set(r, String.join(separator, tokens));
    }
    return newColumn;
}
Example usage of com.google.common.base.Splitter in project symja_android_library by axkr:
class StringMapFunctions, method tokens.
/**
 * Returns a column of arbitrary size containing each token in this column, where a token is
 * defined using the given separator. Tokens are trimmed and empty tokens are dropped.
 *
 * <p>NOTE: Unlike other map functions, this method produces a column whose size may be different
 * from the source, so they cannot safely be combined in a table.
 *
 * @param separator the delimiter used in the tokenizing operation
 * @return a new column
 */
default StringColumn tokens(String separator) {
    StringColumn newColumn = StringColumn.create(name() + "[token count]");
    // Splitter is immutable and depends only on the separator — build it once,
    // not once per row.
    Splitter splitter = Splitter.on(separator).trimResults().omitEmptyStrings();
    for (int r = 0; r < size(); r++) {
        // iterate the lazy split directly; no defensive copy needed for read-only traversal
        for (String token : splitter.split(getString(r))) {
            newColumn.append(token);
        }
    }
    return newColumn;
}
Example usage of com.google.common.base.Splitter in project symja_android_library by axkr:
class StringMapFunctions, method tokenizeAndRemoveDuplicates.
/**
 * Returns a new column of the same size where each value has been tokenized on
 * {@code separator}, duplicate tokens removed (first occurrence wins, order preserved),
 * and the remaining tokens rejoined with the same separator. Tokens are trimmed and
 * empty tokens are dropped.
 *
 * @param separator the delimiter used for both splitting and rejoining
 * @return a new column named {@code name() + "[without duplicates]"}
 */
default StringColumn tokenizeAndRemoveDuplicates(String separator) {
    StringColumn newColumn = StringColumn.create(name() + "[without duplicates]", this.size());
    // Splitter is immutable and depends only on the separator — build it once,
    // not once per row.
    Splitter splitter = Splitter.on(separator).trimResults().omitEmptyStrings();
    for (int r = 0; r < size(); r++) {
        // distinct() keeps encounter order, matching first-occurrence-wins semantics
        String result = splitter.splitToList(getString(r)).stream()
                .distinct()
                .collect(Collectors.joining(separator));
        newColumn.set(r, result);
    }
    return newColumn;
}
Aggregations