Search in sources :

Example 1 with BinarySplitEnumeration

use of org.knime.base.node.mine.treeensemble2.data.TreeNominalColumnData.BinarySplitEnumeration in project knime-core by knime.

the class TreeNominalColumnData method calcBestSplitClassificationBinary.

/**
 * Searches for the binary partition of this column's nominal values that yields the highest gain
 * for a nominal (classification) target.
 *
 * <p>The rows referenced by {@code columnMemberships} are first aggregated into one target
 * distribution and one weight per non-missing nominal value. Afterwards the partitions delivered by
 * a {@link BinarySplitEnumeration} are scored with {@code impCriterion}. A partition whose branches
 * both reach the configured minimum child size is preferred over one that does not; equal gains are
 * resolved by a random draw from {@code rd}.
 *
 * @param columnMemberships cursor over the rows of this column that take part in the split search
 * @param targetPriors provides the impurity of the target before splitting
 * @param targetColumn supplies the target class index of each row
 * @param impCriterion impurity measure used to score a partition
 * @param nomVals nominal values of this column; when {@code containsMissingValues()} is true only
 *            the first {@code nomVals.length - 1} entries are aggregated (presumably the last entry
 *            represents the missing value - TODO confirm)
 * @param targetVals nominal values of the target column
 * @param rd random source for tie breaking and for sampling partitions of large value sets
 * @return the best split candidate, or {@code null} if there are fewer than two nominal values or
 *         no partition has a gain greater than zero
 */
NominalBinarySplitCandidate calcBestSplitClassificationBinary(final ColumnMemberships columnMemberships, final ClassificationPriors targetPriors, final TreeTargetNominalColumnData targetColumn, final IImpurity impCriterion, final NominalValueRepresentation[] nomVals, final NominalValueRepresentation[] targetVals, final RandomData rd) {
    if (nomVals.length <= 1) {
        return null;
    }
    final int minChildSize = getConfiguration().getMinChildSize();
    final int lengthNonMissing = containsMissingValues() ? nomVals.length - 1 : nomVals.length;
    // distribution of target for each attribute value
    final double[][] targetCountsSplitPerAttribute = new double[lengthNonMissing][targetVals.length];
    // number of valid records for each attribute value
    final double[] attWeights = new double[lengthNonMissing];
    // number (sum) of total valid values
    double totalWeight = 0.0;
    int start = 0;
    columnMemberships.next();
    for (int att = 0; att < lengthNonMissing; att++) {
        final int end = start + m_nominalValueCounts[att];
        double currentAttValWeight = 0.0;
        boolean reachedEnd = false;
        for (int index = columnMemberships.getIndexInColumn(); index < end; index = columnMemberships.getIndexInColumn()) {
            final double weight = columnMemberships.getRowWeight();
            assert weight > EPSILON : "The usage of datamemberships should ensure that no rows with zero weight are encountered";
            int target = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            targetCountsSplitPerAttribute[att][target] += weight;
            currentAttValWeight += weight;
            if (!columnMemberships.next()) {
                // No further rows: stop scanning instead of re-reading the cursor position, which
                // mirrors the end-of-cursor handling of the regression variant of this search.
                reachedEnd = true;
                break;
            }
        }
        totalWeight += currentAttValWeight;
        attWeights[att] = currentAttValWeight;
        start = end;
        if (reachedEnd) {
            // the remaining values have no rows; their counts and weights stay at zero
            break;
        }
    }
    BinarySplitEnumeration splitEnumeration;
    if (nomVals.length <= 10) {
        splitEnumeration = new FullBinarySplitEnumeration(nomVals.length);
    } else {
        // Additive operators bind tighter than shifts, so this always was 1 << 8 (= 256); the
        // parentheses only make that explicit.
        int maxSearch = 1 << (10 - 2);
        splitEnumeration = new RandomBinarySplitEnumeration(nomVals.length, maxSearch, rd);
    }
    BigInteger bestPartitionMask = null;
    boolean isBestSplitValid = false;
    double bestPartitionGain = Double.NEGATIVE_INFINITY;
    final double[] targetCountsSplitLeft = new double[targetVals.length];
    final double[] targetCountsSplitRight = new double[targetVals.length];
    final double[] binaryImpurityValues = new double[2];
    final double[] binaryPartitionWeights = new double[2];
    do {
        Arrays.fill(targetCountsSplitLeft, 0.0);
        Arrays.fill(targetCountsSplitRight, 0.0);
        double weightLeft = 0.0;
        double weightRight = 0.0;
        // Only the non-missing values carry aggregated counts. Iterating up to nomVals.length would
        // index past the end of the per-value arrays whenever the column contains missing values.
        for (int i = 0; i < lengthNonMissing; i++) {
            final boolean isAttributeInRightBranch = splitEnumeration.isInRightBranch(i);
            double[] targetCountsCurrentAttribute = targetCountsSplitPerAttribute[i];
            for (int targetVal = 0; targetVal < targetVals.length; targetVal++) {
                if (isAttributeInRightBranch) {
                    targetCountsSplitRight[targetVal] += targetCountsCurrentAttribute[targetVal];
                } else {
                    targetCountsSplitLeft[targetVal] += targetCountsCurrentAttribute[targetVal];
                }
            }
            if (isAttributeInRightBranch) {
                weightRight += attWeights[i];
            } else {
                weightLeft += attWeights[i];
            }
        }
        binaryPartitionWeights[0] = weightRight;
        binaryPartitionWeights[1] = weightLeft;
        boolean isValidSplit = weightRight >= minChildSize && weightLeft >= minChildSize;
        binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetCountsSplitRight, weightRight);
        binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetCountsSplitLeft, weightLeft);
        double postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight);
        double gain = impCriterion.getGain(targetPriors.getPriorImpurity(), postSplitImpurity, binaryPartitionWeights, totalWeight);
        // use random tie breaker if gains are equal
        boolean randomTieBreaker = gain == bestPartitionGain ? rd.nextInt(0, 1) == 1 : false;
        // store if better than before or first valid split
        if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
            // never replace a valid best split by one that violates the minimum child size
            if (isValidSplit || !isBestSplitValid) {
                bestPartitionGain = gain;
                bestPartitionMask = splitEnumeration.getValueMask();
                isBestSplitValid = isValidSplit;
            }
        }
    } while (splitEnumeration.next());
    if (bestPartitionGain > 0.0) {
        return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
    }
    return null;
}
Also used : BigInteger(java.math.BigInteger) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate)

Example 2 with BinarySplitEnumeration

use of org.knime.base.node.mine.treeensemble2.data.TreeNominalColumnData.BinarySplitEnumeration in project knime-core by knime.

the class TreeNominalColumnData method calcBestSplitRegressionBinary.

/**
 * Searches for the binary partition of this column's nominal values that yields the highest gain
 * for a numeric (regression) target.
 *
 * <p>The rows referenced by {@code columnMemberships} are aggregated into a weighted target sum and
 * a weight sum per nominal value. Each partition delivered by a {@link BinarySplitEnumeration} is
 * scored as {@code ySumRight^2 / weightRight + ySumLeft^2 / weightLeft - ySumTotal^2 / nrRecordsTotal}.
 * A partition whose branches both reach the configured minimum child size is preferred over one
 * that does not; equal gains are resolved by a random draw from {@code rd}.
 *
 * @param columnMemberships cursor over the rows of this column that take part in the split search
 * @param targetPriors provides the total target sum and the total number of records
 * @param targetColumn supplies the numeric target value of each row
 * @param nomVals nominal values of this column
 * @param rd random source for tie breaking and for sampling partitions of large value sets
 * @return the best split candidate, or {@code null} if no partition has a gain greater than zero
 */
private NominalBinarySplitCandidate calcBestSplitRegressionBinary(final ColumnMemberships columnMemberships, final RegressionPriors targetPriors, final TreeTargetNumericColumnData targetColumn, final NominalValueRepresentation[] nomVals, final RandomData rd) {
    final int minChildSize = getConfiguration().getMinChildSize();
    final double ySumTotal = targetPriors.getYSum();
    final double nrRecordsTotal = targetPriors.getNrRecords();
    final double criterionTotal = ySumTotal * ySumTotal / nrRecordsTotal;
    // weighted target sum and weight sum for each attribute value
    final double[] ySums = new double[nomVals.length];
    final double[] sumWeightsAttributes = new double[nomVals.length];
    columnMemberships.next();
    int start = 0;
    for (int att = 0; att < nomVals.length; att++) {
        int end = start + m_nominalValueCounts[att];
        double weightSum = 0.0;
        double ySum = 0.0;
        boolean reachedEnd = false;
        for (int index = columnMemberships.getIndexInColumn(); index < end; index = columnMemberships.getIndexInColumn()) {
            final double weight = columnMemberships.getRowWeight();
            assert weight > EPSILON : "Instances in columnMemberships must have weights larger than EPSILON.";
            ySum += weight * targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            weightSum += weight;
            if (!columnMemberships.next()) {
                // reached end of columnMemberships
                reachedEnd = true;
                break;
            }
        }
        sumWeightsAttributes[att] = weightSum;
        ySums[att] = ySum;
        start = end;
        if (reachedEnd) {
            // the remaining values have no rows; their sums stay at zero
            break;
        }
    }
    BinarySplitEnumeration splitEnumeration;
    if (nomVals.length <= 10) {
        splitEnumeration = new FullBinarySplitEnumeration(nomVals.length);
    } else {
        // Additive operators bind tighter than shifts, so this always was 1 << 8 (= 256); the
        // parentheses only make that explicit.
        int maxSearch = 1 << (10 - 2);
        splitEnumeration = new RandomBinarySplitEnumeration(nomVals.length, maxSearch, rd);
    }
    BigInteger bestPartitionMask = null;
    boolean isBestSplitValid = false;
    double bestPartitionGain = Double.NEGATIVE_INFINITY;
    do {
        double weightLeft = 0.0;
        double ySumLeft = 0.0;
        double weightRight = 0.0;
        double ySumRight = 0.0;
        for (int i = 0; i < nomVals.length; i++) {
            final boolean isAttributeInRightBranch = splitEnumeration.isInRightBranch(i);
            if (isAttributeInRightBranch) {
                weightRight += sumWeightsAttributes[i];
                ySumRight += ySums[i];
            } else {
                weightLeft += sumWeightsAttributes[i];
                ySumLeft += ySums[i];
            }
        }
        if (!(weightRight > 0.0 && weightLeft > 0.0)) {
            // A partition with an empty branch does not split anything, and the gain below would
            // evaluate to 0/0 = NaN. Once stored as the best gain, NaN could never be replaced
            // because every comparison against it is false, so such partitions are skipped.
            continue;
        }
        final boolean isValidSplit = weightRight >= minChildSize && weightLeft >= minChildSize;
        double gain = ySumRight * ySumRight / weightRight + ySumLeft * ySumLeft / weightLeft - criterionTotal;
        // use random tie breaker if gains are equal
        boolean randomTieBreaker = gain == bestPartitionGain ? rd.nextInt(0, 1) == 1 : false;
        // store if better than before or first valid split
        if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
            // never replace a valid best split by one that violates the minimum child size
            if (isValidSplit || !isBestSplitValid) {
                bestPartitionGain = gain;
                bestPartitionMask = splitEnumeration.getValueMask();
                isBestSplitValid = isValidSplit;
            }
        }
    } while (splitEnumeration.next());
    if (bestPartitionGain > 0.0) {
        return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
    }
    return null;
}
Also used : BigInteger(java.math.BigInteger) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate)

Example 3 with BinarySplitEnumeration

use of org.knime.base.node.mine.treeensemble2.data.TreeNominalColumnData.BinarySplitEnumeration in project knime-core by knime.

the class FullBinarySplitEnumerationTest method testBinarySplitEnumerationCountTuples.

/**
 * Checks that a {@link FullBinarySplitEnumeration} over {@code k} unique values visits exactly
 * {@code 2^(k-1) - 1} partitions, for every {@code k} from 2 up to (but not including) 10.
 */
@Test(timeout = 2000L)
public void testBinarySplitEnumerationCountTuples() {
    final byte upperBoundExclusive = 10;
    for (byte valueCount = 2; valueCount < upperBoundExclusive; valueCount++) {
        final BinarySplitEnumeration enumeration = new FullBinarySplitEnumeration(valueCount);
        // 2^(valueCount - 1) - 1, computed with a shift instead of floating point arithmetic
        final int expectedPartitions = (1 << (valueCount - 1)) - 1;
        // the enumeration starts out positioned on its first partition
        int visitedPartitions = 1;
        while (instance(enumeration)) {
            visitedPartitions++;
        }
        Assert.assertEquals("For test count = " + valueCount, expectedPartitions, visitedPartitions);
    }
}

/** Advances the given enumeration and reports whether another partition is available. */
private static boolean instance(final BinarySplitEnumeration enumeration) {
    return enumeration.next();
}
Also used : BinarySplitEnumeration(org.knime.base.node.mine.treeensemble2.data.TreeNominalColumnData.BinarySplitEnumeration) FullBinarySplitEnumeration(org.knime.base.node.mine.treeensemble2.data.TreeNominalColumnData.FullBinarySplitEnumeration) Test(org.junit.Test)

Aggregations

BigInteger (java.math.BigInteger)2 NominalBinarySplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate)2 Test (org.junit.Test)1 BinarySplitEnumeration (org.knime.base.node.mine.treeensemble2.data.TreeNominalColumnData.BinarySplitEnumeration)1 FullBinarySplitEnumeration (org.knime.base.node.mine.treeensemble2.data.TreeNominalColumnData.FullBinarySplitEnumeration)1