use of ml.shifu.shifu.container.ValueObject in project shifu by ShifuML.
the class BasicStatsCalculator method calculateStats.
/**
 * Recomputes the summary statistics for {@code voList}.
 *
 * <p>{@code sum} and {@code squaredSum} are reset first. Every value that is finite, not NaN
 * and whose magnitude does not exceed {@code threshold} contributes to {@code max},
 * {@code min}, {@code sum} and {@code squaredSum}; all other values are logged and skipped.
 * The median, {@code mean} and {@code stdDev} are only written when more than one usable
 * value was seen and neither accumulator overflowed to infinity; otherwise they are left
 * untouched.
 */
private void calculateStats() {
    sum = 0.0;
    squaredSum = 0.0;

    if (voList.size() == 0) {
        return;
    }

    int usableCount = 0;
    for (ValueObject entry : voList) {
        Double boxed = entry.getValue();

        boolean unusable = boxed.isInfinite() || boxed.isNaN() || Math.abs(boxed) > threshold;
        if (unusable) {
            log.warn("Invalid value - " + boxed);
            continue;
        }

        usableCount++;
        max = Math.max(max, boxed);
        min = Math.min(min, boxed);
        sum += boxed;
        squaredSum += boxed * boxed;
    }

    // Too few usable values: mean and stdDev are left as they were.
    if (usableCount <= 1) {
        return;
    }
    // Overflowed accumulators cannot yield a meaningful mean or deviation.
    if (sum.isInfinite() || squaredSum.isInfinite()) {
        return;
    }

    // The middle element is only the true median while voList is sorted.
    setMedian(voList.get(voList.size() / 2).getValue());

    mean = sum / usableCount;

    // Sample standard deviation; EPS is added to the numerator before dividing.
    double numerator = squaredSum - (sum * sum) / usableCount + EPS;
    stdDev = Math.sqrt(numerator / (usableCount - 1));
}
use of ml.shifu.shifu.container.ValueObject in project shifu by ShifuML.
the class Binning method doEqualPositiveBinning.
/**
 * Equal-positive ("equal bad") binning.
 *
 * Walks the first voSize entries of voList in order and assigns them to consecutive
 * bins, opening a new bin once the current one has collected
 * ceil(positiveCount / expectNumBins) positive records. At most expectNumBins bins
 * are produced.
 *
 * Results are appended to binBoundary, binCountNeg, binCountPos, binAvgScore,
 * binPosCaseRate, binWeightedNeg and binWeightedPos, and actualNumBins is set to the
 * number of bins actually opened.
 *
 * NOTE(review): presumably voList is sorted by value before this is called, since the
 * value of the first record of each new bin is used as its lower boundary - confirm
 * against the caller.
 * NOTE(review): voList.get(0) is read unconditionally, so an empty voList would
 * throw - confirm callers never pass one.
 */
private void doEqualPositiveBinning() {
// Count the records whose tag is one of the positive tags.
int sumBad = 0;
for (int i = 0; i < voSize; i++) {
sumBad += (posTags.contains(voList.get(i).getTag()) ? 1 : 0);
}
// Number of positive records a bin must hold before it is considered full.
int binSize = (int) Math.ceil((double) sumBad / (double) expectNumBins);
int currBin = 0;
// double currBinSumScore = 0;
// Per-bin counters. These are boxed arrays, so every slot starts out as null; only
// bin 0 is initialised here and each later bin is initialised when it is opened.
Integer[] countNeg = new Integer[expectNumBins];
Integer[] countPos = new Integer[expectNumBins];
Double[] countWeightedNeg = new Double[expectNumBins];
Double[] countWeightedPos = new Double[expectNumBins];
countNeg[0] = 0;
countPos[0] = 0;
countWeightedNeg[0] = 0.0;
countWeightedPos[0] = 0.0;
// add first bin (from negative infinite)
this.binBoundary.add(Double.NEGATIVE_INFINITY);
ValueObject vo = null;
double prevData = voList.get(0).getValue();
// For each record
for (int i = 0; i < voSize; i++) {
vo = voList.get(i);
double currData = vo.getValue();
// current bin is full
if (countPos[currBin] >= binSize) {
// The last permitted bin is already full: every remaining record except the
// final one is skipped here, i.e. it is neither counted nor used to update
// prevData.
// NOTE(review): confirm that dropping these leftover records from the counts
// is intended. The loop bound uses voSize while this check uses voList.size().
if (currBin == expectNumBins - 1 && i != voList.size() - 1) {
continue;
}
// Open a new bin at the first record, whenever merging is disabled, or - with
// merging enabled - only when the value differs from the previous record by more
// than EPS, so that equal values stay in the same bin.
if (i == 0 || (mergeEnabled == true && Math.abs(currData - prevData) > EPS) || mergeEnabled == false) {
// MOVE to the new bin, if not the last vo; the last vo ends the loop here
// without being counted.
if (i == voList.size() - 1) {
break;
}
currBin++;
// The value of the first record in the new bin becomes its boundary.
this.binBoundary.add(currData);
// AFTER move to the new bin
// currBinSumScore = 0;
countNeg[currBin] = 0;
countPos[currBin] = 0;
countWeightedNeg[currBin] = 0.0;
countWeightedPos[currBin] = 0.0;
}
}
// increment the counter of the current bin: a record is negative when its tag is in
// negTags; every other record is counted as positive, including tags that are not
// in posTags.
if (negTags.contains(voList.get(i).getTag())) {
countNeg[currBin]++;
countWeightedNeg[currBin] += vo.getWeight();
} else {
countPos[currBin]++;
countWeightedPos[currBin] += vo.getWeight();
}
prevData = currData;
}
// Finishing...
// this.binBoundary.add(vo.getNumericalData());
// this.binAvgScore.add(currBinSumScore / (countNeg[currBin] +
// countPos[currBin]));
this.actualNumBins = currBin + 1;
// Publish the per-bin results for the bins that were actually opened.
for (int i = 0; i < this.actualNumBins; i++) {
binCountNeg.add(countNeg[i]);
binCountPos.add(countPos[i]);
// The average score is not computed by this method; 0 is stored for every bin.
binAvgScore.add(0);
// Share of positive records among all records counted in the bin.
binPosCaseRate.add((double) countPos[i] / (countPos[i] + countNeg[i]));
this.binWeightedNeg.add(countWeightedNeg[i]);
this.binWeightedPos.add(countWeightedPos[i]);
}
}
use of ml.shifu.shifu.container.ValueObject in project shifu by ShifuML.
the class BinningTest method autoTest.
/**
 * Smoke test for auto-typed binning: adds three random records that carry only a raw
 * string value, then runs the EqualPositive, EqualTotal and EqualInterval binning methods
 * in turn. No assertions are made yet.
 */
@Test
public void autoTest() {
    Set<String> categorySet = new HashSet<String>();

    for (int remaining = 3; remaining > 0; remaining--) {
        String rawCategory = String.valueOf(rdm.nextInt(100));
        categorySet.add(rawCategory);
        String tag = String.valueOf(rdm.nextInt(2));
        double weight = rdm.nextDouble();

        // Only the raw string is populated; the numeric value is left unset.
        ValueObject record = new ValueObject();
        record.setRaw(rawCategory);
        record.setTag(tag);
        record.setWeight(weight);
        voList.add(record);
    }

    List<String> positives = new ArrayList<String>();
    positives.add("1");
    List<String> negatives = new ArrayList<String>();
    negatives.add("0");

    binA = new Binning(positives, negatives, BinningDataType.Auto, voList);
    binA.setMaxNumOfBins(6);
    binA.setBinningMethod(BinningMethod.EqualPositive);
    binA.setAutoTypeThreshold(1002);
    binA.setMergeEnabled(true);
    binA.doBinning();

    // Re-run the same data through the remaining binning methods.
    BinningMethod[] followUps = { BinningMethod.EqualTotal, BinningMethod.EqualInterval };
    for (BinningMethod method : followUps) {
        binA.setBinningMethod(method);
        binA.doBinning();
    }
    // TODO test case
}
use of ml.shifu.shifu.container.ValueObject in project shifu by ShifuML.
the class DataPrepareWorker method convertRawDataIntoValueObject.
/**
 * Converts raw delimited records into ValueObjects, grouped by column, for calculating stats.
 *
 * <p>Every line is counted towards the total before sampling is applied. A line is then
 * kept or dropped according to the binning sample rate (applied only to negative-tagged
 * lines when "sample negative only" is configured). For each kept line, one ValueObject is
 * built per column that has an entry in {@code columnNumToActorMap}:
 * <ul>
 * <li>numerical column: the parsed number; an unparsable field is counted as missing and
 * no ValueObject is added for it;</li>
 * <li>categorical column: the trimmed raw string; empty or configured missing/invalid
 * values are counted as missing but are still added;</li>
 * <li>any other column (auto type): the parsed number if possible, otherwise the raw
 * string, which is also counted as missing.</li>
 * </ul>
 * When a weight column is configured, the record weight is parsed from it, falling back
 * to 1.0 if it is not a valid number.
 *
 * @param rawDataList
 *            raw data for training, one delimited record per element
 * @param columnVoListMap
 *            output map, column-id --> ValueObject list; missing lists are created and
 *            new ValueObjects are appended
 * @return the number of lines seen (before sampling) together with the per-column counts
 *         of missing or invalid values
 * @throws ShifuException
 *             if the data field length is not equal header length
 */
private DataPrepareStatsResult convertRawDataIntoValueObject(List<String> rawDataList,
        Map<Integer, List<ValueObject>> columnVoListMap) throws ShifuException {
    double sampleRate = modelConfig.getBinningSampleRate();
    long total = 0L;
    Map<Integer, Long> missingMap = new HashMap<Integer, Long>();

    for (String line : rawDataList) {
        total++;

        String[] raw = CommonUtils.split(line, modelConfig.getDataSetDelimiter());
        if (raw.length != columnConfigList.size()) {
            log.error("Expected Columns: " + columnConfigList.size() + ", but got: " + raw.length);
            throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
        }

        String tag = CommonUtils.trimTag(raw[targetColumnNum]);

        // Down-sampling: either negatives only, or every record.
        if (modelConfig.isBinningSampleNegOnly()) {
            if (modelConfig.getNegTags().contains(tag) && random.nextDouble() > sampleRate) {
                continue;
            }
        } else {
            if (random.nextDouble() > sampleRate) {
                continue;
            }
        }

        for (int i = 0; i < raw.length; i++) {
            if (!columnNumToActorMap.containsKey(i)) {
                // ignore non-used columns
                continue;
            }

            // Defensive only: the length check above already guarantees i is in range.
            if (i >= columnConfigList.size()) {
                log.error("The input size is longer than expected, need to check your data");
                continue;
            }

            ValueObject vo = new ValueObject();
            ColumnConfig config = columnConfigList.get(i);

            if (config.isNumerical()) {
                // NUMERICAL
                try {
                    vo.setValue(Double.valueOf(raw[i].trim()));
                    vo.setRaw(null);
                } catch (Exception e) {
                    log.debug("Column " + config.getColumnNum() + ": " + config.getColumnName() + " is expected to be NUMERICAL, however received: " + raw[i]);
                    incMap(i, missingMap);
                    // an unparsable numerical field contributes no ValueObject
                    continue;
                }
            } else if (config.isCategorical()) {
                // CATEGORICAL
                if (raw[i] == null || StringUtils.isEmpty(raw[i]) || modelConfig.getDataSet().getMissingOrInvalidValues().contains(raw[i].toLowerCase().trim())) {
                    incMap(i, missingMap);
                }
                // NOTE(review): a null field would still fail on trim() below; presumably
                // CommonUtils.split never yields null elements - confirm.
                vo.setRaw(raw[i].trim());
                vo.setValue(null);
            } else {
                // AUTO TYPE: numeric if it parses, otherwise kept as a raw string
                try {
                    vo.setValue(Double.valueOf(raw[i]));
                    vo.setRaw(null);
                } catch (Exception e) {
                    incMap(i, missingMap);
                    vo.setRaw(raw[i]);
                    vo.setValue(null);
                }
            }

            if (this.weightedColumnNum != -1) {
                try {
                    vo.setWeight(Double.valueOf(raw[weightedColumnNum]));
                } catch (NumberFormatException e) {
                    // unparsable weight: fall back to the neutral weight
                    vo.setWeight(1.0);
                }
                // The parsed weight must not be overwritten here; doing so would make
                // the configured weight column have no effect.
            }

            vo.setTag(tag);

            List<ValueObject> voList = columnVoListMap.get(i);
            if (voList == null) {
                voList = new ArrayList<ValueObject>();
                columnVoListMap.put(i, voList);
            }
            voList.add(vo);
        }
    }

    return new DataPrepareStatsResult(total, missingMap);
}
use of ml.shifu.shifu.container.ValueObject in project shifu by ShifuML.
the class BinningTest method numericalTest.
/**
 * Bins 5000 random numerical records with each binning method in turn and checks that
 * the resulting number of bins equals {@code numBin}.
 */
@Test
public void numericalTest() {
    final int recordCount = 5000;
    int added = 0;
    while (added < recordCount) {
        // The order of the random draws matters for reproducibility with a seeded generator.
        double value = rdm.nextDouble();
        String raw = String.valueOf(rdm.nextInt(100));
        String tag = String.valueOf(rdm.nextInt(2));
        double weight = rdm.nextDouble();

        ValueObject record = new ValueObject();
        record.setValue(value);
        record.setRaw(raw);
        record.setTag(tag);
        record.setWeight(weight);
        voList.add(record);
        added++;
    }

    List<String> positives = new ArrayList<String>();
    positives.add("1");
    List<String> negatives = new ArrayList<String>();
    negatives.add("0");

    binN = new Binning(positives, negatives, BinningDataType.Numerical, voList);
    binN.setMaxNumOfBins(numBin);
    binN.setBinningMethod(BinningMethod.EqualPositive);
    binN.setAutoTypeThreshold(3);
    binN.setMergeEnabled(true);
    binN.doBinning();

    // Re-run the same data through the remaining binning methods.
    BinningMethod[] followUps = { BinningMethod.EqualTotal, BinningMethod.EqualInterval };
    for (BinningMethod method : followUps) {
        binN.setBinningMethod(method);
        binN.doBinning();
    }

    Assert.assertEquals(binN.getNumBins(), numBin);
}
Aggregations