Search in sources:

Example 1 with ValueObject

use of ml.shifu.shifu.container.ValueObject in project shifu by ShifuML.

the class BasicStatsCalculator method calculateStats.

/**
 * Recomputes the running statistics over {@code voList}.
 *
 * <p>{@code sum} and {@code squaredSum} are reset first. Every entry whose value is finite,
 * not NaN and within {@code threshold} in absolute value contributes to {@code min},
 * {@code max}, {@code sum} and {@code squaredSum}; any other entry is logged and skipped.
 * The median, {@code mean} and {@code stdDev} are only assigned when at least two usable
 * values were seen and neither accumulator overflowed to infinity.
 */
private void calculateStats() {
    sum = 0.0;
    squaredSum = 0.0;
    if (voList.isEmpty()) {
        return;
    }

    int usableCount = 0;
    for (ValueObject entry : voList) {
        Double current = entry.getValue();
        boolean usable = !current.isInfinite() && !current.isNaN() && !(Math.abs(current) > threshold);
        if (usable) {
            usableCount++;
            max = Math.max(max, current);
            min = Math.min(min, current);
            sum += current;
            squaredSum += current * current;
        } else {
            log.warn("Invalid value - " + current);
        }
    }

    // With too few usable values, or overflowed accumulators, mean and stdDev are not
    // assigned here (the original author notes they default to NaN).
    boolean tooFewValues = usableCount <= 1;
    if (tooFewValues || sum.isInfinite() || squaredSum.isInfinite()) {
        return;
    }

    // The middle element of voList is taken as the median; this is only a true median
    // while voList is sorted. Note the index is based on the full list size, not on the
    // number of usable values.
    int middleIndex = voList.size() / 2;
    setMedian(voList.get(middleIndex).getValue());

    mean = sum / usableCount;
    // Sample variance numerator, with EPS added before dividing by (n - 1).
    double spread = squaredSum - (sum * sum) / usableCount + EPS;
    stdDev = Math.sqrt(spread / (usableCount - 1));
}
Also used : ValueObject(ml.shifu.shifu.container.ValueObject)

Example 2 with ValueObject

use of ml.shifu.shifu.container.ValueObject in project shifu by ShifuML.

the class Binning method doEqualPositiveBinning.

/**
 * Equal-positive ("equal bad") binning.
 *
 * <p>Walks {@code voList} in its current order and opens a new bin once the current bin
 * has collected at least {@code ceil(positives / expectNumBins)} positive entries. The
 * lower boundary of every bin is appended to {@code binBoundary} (the first one is negative
 * infinity), and per-bin counts, weighted counts and positive rates are appended to the
 * corresponding {@code bin*} collections. {@code actualNumBins} is set to the number of
 * bins actually opened.
 *
 * <p>NOTE(review): the loop presumably expects {@code voList} to be sorted by value and
 * non-empty ({@code voList.get(0)} is read unconditionally) - confirm against the caller.
 */
private void doEqualPositiveBinning() {
    // Count the entries whose tag is one of the positive tags.
    int sumBad = 0;
    for (int i = 0; i < voSize; i++) {
        sumBad += (posTags.contains(voList.get(i).getTag()) ? 1 : 0);
    }
    // Target number of positives per bin, rounded up.
    int binSize = (int) Math.ceil((double) sumBad / (double) expectNumBins);
    int currBin = 0;
    // double currBinSumScore = 0;
    // Per-bin accumulators; a slot is only initialised when its bin is opened.
    Integer[] countNeg = new Integer[expectNumBins];
    Integer[] countPos = new Integer[expectNumBins];
    Double[] countWeightedNeg = new Double[expectNumBins];
    Double[] countWeightedPos = new Double[expectNumBins];
    countNeg[0] = 0;
    countPos[0] = 0;
    countWeightedNeg[0] = 0.0;
    countWeightedPos[0] = 0.0;
    // add first bin (from negative infinite)
    this.binBoundary.add(Double.NEGATIVE_INFINITY);
    ValueObject vo = null;
    double prevData = voList.get(0).getValue();
    // For each Variable
    for (int i = 0; i < voSize; i++) {
        vo = voList.get(i);
        double currData = vo.getValue();
        // current bin is full
        if (countPos[currBin] >= binSize) {
            // still have some negative leftover
            // NOTE(review): this `continue` skips the counter updates below, so entries
            // that arrive after the last bin is full (other than the final entry) are not
            // counted in any bin - confirm this is intended.
            if (currBin == expectNumBins - 1 && i != voList.size() - 1) {
                continue;
            }
            // and data is different from the previous pair
            // (with merging enabled, equal neighbouring values are kept in the same bin)
            if (i == 0 || (mergeEnabled == true && Math.abs(currData - prevData) > EPS) || mergeEnabled == false) {
                // MOVE to the new bin, if not the last vo
                // (the last entry never opens a bin; the loop ends without counting it)
                if (i == voList.size() - 1) {
                    break;
                }
                currBin++;
                // The current value becomes the lower boundary of the new bin.
                this.binBoundary.add(currData);
                // AFTER move to the new bin
                // currBinSumScore = 0;
                countNeg[currBin] = 0;
                countPos[currBin] = 0;
                countWeightedNeg[currBin] = 0.0;
                countWeightedPos[currBin] = 0.0;
            }
        }
        // increment the counter of the current bin
        // NOTE(review): any tag that is not in negTags is counted as positive here, while
        // sumBad above only counts tags found in posTags - confirm the two always agree.
        if (negTags.contains(voList.get(i).getTag())) {
            countNeg[currBin]++;
            countWeightedNeg[currBin] += vo.getWeight();
        } else {
            countPos[currBin]++;
            countWeightedPos[currBin] += vo.getWeight();
        }
        prevData = currData;
    }
    // Finishing...
    // this.binBoundary.add(vo.getNumericalData());
    // this.binAvgScore.add(currBinSumScore / (countNeg[currBin] +
    // countPos[currBin]));
    this.actualNumBins = currBin + 1;
    // Publish the per-bin results; the average score is always recorded as 0 here.
    for (int i = 0; i < this.actualNumBins; i++) {
        binCountNeg.add(countNeg[i]);
        binCountPos.add(countPos[i]);
        binAvgScore.add(0);
        // Floating-point division: an empty bin (0 / 0) yields NaN rather than throwing.
        binPosCaseRate.add((double) countPos[i] / (countPos[i] + countNeg[i]));
        this.binWeightedNeg.add(countWeightedNeg[i]);
        this.binWeightedPos.add(countWeightedPos[i]);
    }
}
Also used : ValueObject(ml.shifu.shifu.container.ValueObject)

Example 3 with ValueObject

use of ml.shifu.shifu.container.ValueObject in project shifu by ShifuML.

the class BinningTest method autoTest.

/**
 * Smoke test for auto-typed binning: builds three random samples that carry only a raw
 * string (no numeric value is set), then runs all three binning methods on them.
 * No assertions are made yet.
 */
@Test
public void autoTest() {
    Set<String> categorySet = new HashSet<String>();
    int remaining = 3;
    while (remaining > 0) {
        ValueObject sample = new ValueObject();
        // Only the raw representation is populated; the numeric value stays unset.
        String input = Integer.toString(rdm.nextInt(100));
        categorySet.add(input);
        sample.setRaw(input);
        sample.setTag(Integer.toString(rdm.nextInt(2)));
        sample.setWeight(rdm.nextDouble());
        voList.add(sample);
        remaining--;
    }

    List<String> positives = new ArrayList<String>();
    positives.add("1");
    List<String> negatives = new ArrayList<String>();
    negatives.add("0");

    binA = new Binning(positives, negatives, BinningDataType.Auto, voList);
    binA.setMaxNumOfBins(6);
    binA.setBinningMethod(BinningMethod.EqualPositive);
    binA.setAutoTypeThreshold(1002);
    binA.setMergeEnabled(true);
    binA.doBinning();

    // Re-run the same data through the two remaining binning methods, in order.
    BinningMethod[] followUps = { BinningMethod.EqualTotal, BinningMethod.EqualInterval };
    for (BinningMethod method : followUps) {
        binA.setBinningMethod(method);
        binA.doBinning();
    }
    // TODO test case
}
Also used : ValueObject(ml.shifu.shifu.container.ValueObject) Test(org.testng.annotations.Test)

Example 4 with ValueObject

use of ml.shifu.shifu.container.ValueObject in project shifu by ShifuML.

the class DataPrepareWorker method convertRawDataIntoValueObject.

/**
 * Converts raw data lines into {@link ValueObject}s, grouped per column, for calculating
 * stats.
 *
 * <p>Each line is split with the data set delimiter and must have exactly as many fields
 * as there are column configs. Lines are then randomly sampled with the binning sample
 * rate (only negative-tagged lines are sampled when "sample negative only" is enabled).
 * For every column that has an entry in {@code columnNumToActorMap}, one value object is
 * built and appended to that column's list in {@code columnVoListMap}. Numerical fields
 * that cannot be parsed are counted as missing and skipped; categorical fields that are
 * empty or listed as missing/invalid are counted as missing but still added.
 *
 * @param rawDataList
 *            raw data for training
 * @param columnVoListMap
 *            column-id to value-object list; lists are created on demand and appended to
 * @return the number of lines read (before sampling) together with the per-column
 *         missing counts
 * @throws ShifuException
 *             if the data field length is not equal to the header length
 */
private DataPrepareStatsResult convertRawDataIntoValueObject(List<String> rawDataList, Map<Integer, List<ValueObject>> columnVoListMap) throws ShifuException {
    double sampleRate = modelConfig.getBinningSampleRate();
    long total = 0L;
    Map<Integer, Long> missingMap = new HashMap<Integer, Long>();
    for (String line : rawDataList) {
        total++;
        String[] raw = CommonUtils.split(line, modelConfig.getDataSetDelimiter());
        if (raw.length != columnConfigList.size()) {
            log.error("Expected Columns: " + columnConfigList.size() + ", but got: " + raw.length);
            throw new ShifuException(ShifuErrorCode.ERROR_NO_EQUAL_COLCONFIG);
        }
        String tag = CommonUtils.trimTag(raw[targetColumnNum]);
        // Down-sample: either only the negative-tagged lines, or every line.
        if (modelConfig.isBinningSampleNegOnly()) {
            if (modelConfig.getNegTags().contains(tag) && random.nextDouble() > sampleRate) {
                continue;
            }
        } else {
            if (random.nextDouble() > sampleRate) {
                continue;
            }
        }
        for (int i = 0; i < raw.length; i++) {
            if (!columnNumToActorMap.containsKey(i)) {
                // ignore non-used columns
                continue;
            }
            ValueObject vo = new ValueObject();
            // Defensive check; the length validation above already rules this out.
            if (i >= columnConfigList.size()) {
                log.error("The input size is longer than expected, need to check your data");
                continue;
            }
            ColumnConfig config = columnConfigList.get(i);
            if (config.isNumerical()) {
                // NUMERICAL
                try {
                    vo.setValue(Double.valueOf(raw[i].trim()));
                    vo.setRaw(null);
                } catch (Exception e) {
                    // Unparsable numeric field: count it as missing and drop the entry.
                    log.debug("Column " + config.getColumnNum() + ": " + config.getColumnName() + " is expected to be NUMERICAL, however received: " + raw[i]);
                    incMap(i, missingMap);
                    continue;
                }
            } else if (config.isCategorical()) {
                // CATEGORICAL
                // Missing/invalid categories are counted but still kept as a raw value.
                if (raw[i] == null || StringUtils.isEmpty(raw[i]) || modelConfig.getDataSet().getMissingOrInvalidValues().contains(raw[i].toLowerCase().trim())) {
                    incMap(i, missingMap);
                }
                vo.setRaw(raw[i].trim());
                vo.setValue(null);
            } else {
                // AUTO TYPE
                // Try numeric first; fall back to the raw string and count it as missing.
                try {
                    vo.setValue(Double.valueOf(raw[i]));
                    vo.setRaw(null);
                } catch (Exception e) {
                    incMap(i, missingMap);
                    vo.setRaw(raw[i]);
                    vo.setValue(null);
                }
            }
            if (this.weightedColumnNum != -1) {
                // Use the weight column when it parses; otherwise fall back to 1.0.
                // (Previously an unconditional setWeight(1.0) after this try/catch
                // overwrote the parsed weight, so the weight column was never honoured.)
                try {
                    vo.setWeight(Double.valueOf(raw[weightedColumnNum]));
                } catch (NumberFormatException e) {
                    vo.setWeight(1.0);
                }
            }
            vo.setTag(tag);
            List<ValueObject> voList = columnVoListMap.get(i);
            if (voList == null) {
                voList = new ArrayList<ValueObject>();
                columnVoListMap.put(i, voList);
            }
            voList.add(vo);
        }
    }
    DataPrepareStatsResult rt = new DataPrepareStatsResult(total, missingMap);
    return rt;
}
Also used : ColumnConfig(ml.shifu.shifu.container.obj.ColumnConfig) HashMap(java.util.HashMap) IOException(java.io.IOException) ShifuException(ml.shifu.shifu.exception.ShifuException) ValueObject(ml.shifu.shifu.container.ValueObject)

Example 5 with ValueObject

use of ml.shifu.shifu.container.ValueObject in project shifu by ShifuML.

the class BinningTest method numericalTest.

/**
 * Runs all three binning methods over 5000 random numerical samples and checks that the
 * resulting number of bins equals {@code numBin}.
 */
@Test
public void numericalTest() {
    int produced = 0;
    while (produced < 5000) {
        ValueObject sample = new ValueObject();
        sample.setValue(rdm.nextDouble());
        sample.setRaw(Integer.toString(rdm.nextInt(100)));
        sample.setTag(Integer.toString(rdm.nextInt(2)));
        sample.setWeight(rdm.nextDouble());
        voList.add(sample);
        produced++;
    }

    List<String> positives = new ArrayList<String>();
    positives.add("1");
    List<String> negatives = new ArrayList<String>();
    negatives.add("0");

    binN = new Binning(positives, negatives, BinningDataType.Numerical, voList);
    binN.setMaxNumOfBins(numBin);
    binN.setBinningMethod(BinningMethod.EqualPositive);
    binN.setAutoTypeThreshold(3);
    binN.setMergeEnabled(true);
    binN.doBinning();

    // Re-run the same data through the two remaining binning methods, in order.
    BinningMethod[] followUps = { BinningMethod.EqualTotal, BinningMethod.EqualInterval };
    for (BinningMethod method : followUps) {
        binN.setBinningMethod(method);
        binN.doBinning();
    }

    Assert.assertEquals(binN.getNumBins(), numBin);
}
Also used : ValueObject(ml.shifu.shifu.container.ValueObject) Test(org.testng.annotations.Test)

Aggregations

ValueObject (ml.shifu.shifu.container.ValueObject)10 Test (org.testng.annotations.Test)3 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 ColumnConfig (ml.shifu.shifu.container.obj.ColumnConfig)1 BasicStatsCalculator (ml.shifu.shifu.core.BasicStatsCalculator)1 Binning (ml.shifu.shifu.core.Binning)1 BinningDataType (ml.shifu.shifu.core.Binning.BinningDataType)1 ColumnMetrics (ml.shifu.shifu.core.ColumnStatsCalculator.ColumnMetrics)1 ShifuException (ml.shifu.shifu.exception.ShifuException)1 DataBag (org.apache.pig.data.DataBag)1 Tuple (org.apache.pig.data.Tuple)1 TupleFactory (org.apache.pig.data.TupleFactory)1