Search in sources :

Example 11 with NominalBinarySplitCandidate

use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.

the class TreeNominalColumnDataTest method testCalcBestSplitClassificationBinary.

/**
 * Tests the method
 * {@link TreeNominalColumnData#calcBestSplitClassification(DataMemberships, ClassificationPriors, TreeTargetNominalColumnData, RandomData)}
 * using binary splits.
 *
 * @throws Exception
 */
@Test
public void testCalcBestSplitClassificationBinary() throws Exception {
    final TreeEnsembleLearnerConfiguration config = createConfig(false);
    Pair<TreeNominalColumnData, TreeTargetNominalColumnData> tennisData = tennisData(config);
    TreeNominalColumnData columnData = tennisData.getFirst();
    TreeTargetNominalColumnData targetData = tennisData.getSecond();
    assertEquals(SplitCriterion.Gini, config.getSplitCriterion());
    double[] rowWeights = new double[SMALL_COLUMN_DATA.length];
    Arrays.fill(rowWeights, 1.0);
    TreeData tennisTreeData = tennisTreeData(config);
    IDataIndexManager indexManager = new DefaultDataIndexManager(tennisTreeData);
    DataMemberships dataMemberships = new RootDataMemberships(rowWeights, tennisTreeData, indexManager);
    ClassificationPriors priors = targetData.getDistribution(rowWeights, config);
    SplitCandidate splitCandidate = columnData.calcBestSplitClassification(dataMemberships, priors, targetData, null);
    assertNotNull(splitCandidate);
    assertThat(splitCandidate, instanceOf(NominalBinarySplitCandidate.class));
    assertTrue(splitCandidate.canColumnBeSplitFurther());
    // manually via libre office calc
    assertEquals(0.0689342404, splitCandidate.getGainValue(), 0.00001);
    NominalBinarySplitCandidate binSplitCandidate = (NominalBinarySplitCandidate) splitCandidate;
    TreeNodeNominalBinaryCondition[] childConditions = binSplitCandidate.getChildConditions();
    assertEquals(2, childConditions.length);
    assertArrayEquals(new String[] { "R" }, childConditions[0].getValues());
    assertArrayEquals(new String[] { "R" }, childConditions[1].getValues());
    assertEquals(SetLogic.IS_NOT_IN, childConditions[0].getSetLogic());
    assertEquals(SetLogic.IS_IN, childConditions[1].getSetLogic());
    BitSet inChild = columnData.updateChildMemberships(childConditions[0], dataMemberships);
    DataMemberships child1Memberships = dataMemberships.createChildMemberships(inChild);
    ClassificationPriors childTargetPriors = targetData.getDistribution(child1Memberships, config);
    SplitCandidate splitCandidateChild = columnData.calcBestSplitClassification(child1Memberships, childTargetPriors, targetData, null);
    assertNotNull(splitCandidateChild);
    assertThat(splitCandidateChild, instanceOf(NominalBinarySplitCandidate.class));
    // manually via libre office calc
    assertEquals(0.0086419753, splitCandidateChild.getGainValue(), 0.00001);
    inChild = columnData.updateChildMemberships(childConditions[1], dataMemberships);
    DataMemberships child2Memberships = dataMemberships.createChildMemberships(inChild);
    childTargetPriors = targetData.getDistribution(child2Memberships, config);
    splitCandidateChild = columnData.calcBestSplitClassification(child2Memberships, childTargetPriors, targetData, null);
    assertNull(splitCandidateChild);
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) BitSet(java.util.BitSet) IDataIndexManager(org.knime.base.node.mine.treeensemble2.data.memberships.IDataIndexManager) NominalMultiwaySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalMultiwaySplitCandidate) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate) SplitCandidate(org.knime.base.node.mine.treeensemble2.learner.SplitCandidate) DefaultDataIndexManager(org.knime.base.node.mine.treeensemble2.data.memberships.DefaultDataIndexManager) DataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) TreeNodeNominalBinaryCondition(org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalBinaryCondition) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate) Test(org.junit.Test)

Example 12 with NominalBinarySplitCandidate

use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.

the class TreeNominalColumnDataTest method testCalcBestSplitClassificationBinaryPCA.

/**
 * Tests the method
 * {@link TreeNominalColumnData#calcBestSplitClassification(DataMemberships, ClassificationPriors, TreeTargetNominalColumnData, RandomData)}
 * using binary splits. In this test case the data has more than two classes and the used algorithm is therefore PCA
 * based.
 *
 * @throws Exception
 */
@Test
public void testCalcBestSplitClassificationBinaryPCA() throws Exception {
    TreeEnsembleLearnerConfiguration config = createConfig(false);
    Pair<TreeNominalColumnData, TreeTargetNominalColumnData> pcaData = createPCATestData(config);
    TreeNominalColumnData columnData = pcaData.getFirst();
    TreeTargetNominalColumnData targetData = pcaData.getSecond();
    TreeData treeData = createTreeData(pcaData);
    assertEquals(SplitCriterion.Gini, config.getSplitCriterion());
    double[] rowWeights = new double[targetData.getNrRows()];
    Arrays.fill(rowWeights, 1.0);
    IDataIndexManager indexManager = new DefaultDataIndexManager(treeData);
    DataMemberships dataMemberships = new RootDataMemberships(rowWeights, treeData, indexManager);
    ClassificationPriors priors = targetData.getDistribution(rowWeights, config);
    SplitCandidate splitCandidate = columnData.calcBestSplitClassification(dataMemberships, priors, targetData, null);
    assertNotNull(splitCandidate);
    assertThat(splitCandidate, instanceOf(NominalBinarySplitCandidate.class));
    assertTrue(splitCandidate.canColumnBeSplitFurther());
    assertEquals(0.0659, splitCandidate.getGainValue(), 0.0001);
    NominalBinarySplitCandidate binarySplitCandidate = (NominalBinarySplitCandidate) splitCandidate;
    TreeNodeNominalBinaryCondition[] childConditions = binarySplitCandidate.getChildConditions();
    assertEquals(2, childConditions.length);
    assertArrayEquals(new String[] { "E" }, childConditions[0].getValues());
    assertArrayEquals(new String[] { "E" }, childConditions[1].getValues());
    assertEquals(SetLogic.IS_NOT_IN, childConditions[0].getSetLogic());
    assertEquals(SetLogic.IS_IN, childConditions[1].getSetLogic());
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) IDataIndexManager(org.knime.base.node.mine.treeensemble2.data.memberships.IDataIndexManager) NominalMultiwaySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalMultiwaySplitCandidate) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate) SplitCandidate(org.knime.base.node.mine.treeensemble2.learner.SplitCandidate) DefaultDataIndexManager(org.knime.base.node.mine.treeensemble2.data.memberships.DefaultDataIndexManager) DataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) TreeNodeNominalBinaryCondition(org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalBinaryCondition) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate) Test(org.junit.Test)

Example 13 with NominalBinarySplitCandidate

use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.

the class TreeNominalColumnData method calcBestSplitClassificationBinaryTwoClass.

private NominalBinarySplitCandidate calcBestSplitClassificationBinaryTwoClass(final ColumnMemberships columnMemberships, final ClassificationPriors targetPriors, final TreeTargetNominalColumnData targetColumn, final IImpurity impCriterion, final NominalValueRepresentation[] nomVals, final NominalValueRepresentation[] targetVals, final RandomData rd) {
    if (targetColumn.getMetaData().getValues().length != 2) {
        throw new IllegalArgumentException("This method can only be used for two class problems.");
    }
    final TreeEnsembleLearnerConfiguration config = getConfiguration();
    final int minChildSize = config.getMinChildSize();
    final boolean useXGBoostMissingValueHandling = config.getMissingValueHandling() == MissingValueHandling.XGBoost;
    int start = 0;
    final int firstClass = targetColumn.getMetaData().getValues()[0].getAssignedInteger();
    double totalWeight = 0.0;
    double totalFirstClassWeight = 0.0;
    final ArrayList<NomValProbabilityPair> nomValProbabilities = new ArrayList<NomValProbabilityPair>();
    if (!columnMemberships.next()) {
        throw new IllegalStateException("The columnMemberships has not been reset or is empty.");
    }
    final int lengthNonMissing = containsMissingValues() ? nomVals.length - 1 : nomVals.length;
    // final int attToConsider = useXGBoostMissingValueHandling ? nomVals.length : lengthNonMissing;
    boolean branchContainsMissingValues = containsMissingValues();
    // calculate probabilities for first class in each nominal value
    for (int att = 0; att < /*attToConsider*/
    lengthNonMissing; att++) {
        int end = start + m_nominalValueCounts[att];
        double attFirstClassWeight = 0;
        double attWeight = 0;
        boolean reachedEnd = false;
        for (int index = columnMemberships.getIndexInColumn(); index < end; index = columnMemberships.getIndexInColumn()) {
            double weight = columnMemberships.getRowWeight();
            assert weight > EPSILON : "Instances in columnMemberships must have weights larger than EPSILON.";
            final int instanceClass = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            if (instanceClass == firstClass) {
                attFirstClassWeight += weight;
                totalFirstClassWeight += weight;
            }
            attWeight += weight;
            totalWeight += weight;
            if (!columnMemberships.next()) {
                // reached end of columnMemberships
                reachedEnd = true;
                if (att == nomVals.length - 1) {
                    // if the column contains no missing values, the last possible nominal value is
                    // not the missing value and therefore branchContainsMissingValues needs to be false
                    branchContainsMissingValues = branchContainsMissingValues && true;
                }
                break;
            }
        }
        if (attWeight > 0) {
            final double firstClassProbability = attFirstClassWeight / attWeight;
            final NominalValueRepresentation nomVal = getMetaData().getValues()[att];
            nomValProbabilities.add(new NomValProbabilityPair(nomVal, firstClassProbability, attWeight, attFirstClassWeight));
        }
        start = end;
        if (reachedEnd) {
            break;
        }
    }
    // account for missing values and their weight
    double missingWeight = 0.0;
    double missingWeightFirstClass = 0.0;
    // otherwise the current indexInColumn won't be larger than start
    if (columnMemberships.getIndexInColumn() >= start) {
        do {
            final double recordWeight = columnMemberships.getRowWeight();
            missingWeight += recordWeight;
            final int recordClass = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            if (recordClass == firstClass) {
                missingWeightFirstClass += recordWeight;
            }
        } while (columnMemberships.next());
    }
    if (missingWeight > EPSILON) {
        branchContainsMissingValues = true;
    }
    nomValProbabilities.sort(null);
    int highestBitPosition = getMetaData().getValues().length - 1;
    if (containsMissingValues()) {
        highestBitPosition--;
    }
    final double[] targetCountsSplitPartition = new double[2];
    final double[] targetCountsSplitRemaining = new double[2];
    final double[] binaryImpurityValues = new double[2];
    final double[] binaryPartitionWeights = new double[2];
    BigInteger partitionMask = BigInteger.ZERO;
    double bestPartitionGain = Double.NEGATIVE_INFINITY;
    BigInteger bestPartitionMask = null;
    boolean isBestSplitValid = false;
    double sumWeightsPartitionTotal = 0.0;
    double sumWeightsPartitionFirstClass = 0.0;
    boolean missingsGoLeft = false;
    final double priorImpurity = useXGBoostMissingValueHandling ? targetPriors.getPriorImpurity() : impCriterion.getPartitionImpurity(subtractMissingClassCounts(targetPriors.getDistribution(), createMissingClassCountsTwoClass(missingWeight, missingWeightFirstClass)), totalWeight);
    // we don't need to iterate over the full list because we always need some value on the other side
    for (int i = 0; i < nomValProbabilities.size() - 1; i++) {
        NomValProbabilityPair nomVal = nomValProbabilities.get(i);
        sumWeightsPartitionTotal += nomVal.m_sumWeights;
        sumWeightsPartitionFirstClass += nomVal.m_firstClassSumWeights;
        partitionMask = partitionMask.or(nomVal.m_bitMask);
        // check if split represented by currentSplitList is in the right branch
        // by convention a split goes towards the right branch if the highest possible bit is set to 1
        final boolean isRightBranch = partitionMask.testBit(highestBitPosition);
        double gain;
        boolean isValidSplit;
        boolean tempMissingsGoLeft = true;
        if (branchContainsMissingValues && useXGBoostMissingValueHandling) {
            // send missing values both ways and take the better direction
            // send missings left
            targetCountsSplitPartition[0] = sumWeightsPartitionFirstClass + missingWeightFirstClass;
            targetCountsSplitPartition[1] = sumWeightsPartitionTotal + missingWeight - targetCountsSplitPartition[0];
            binaryPartitionWeights[1] = sumWeightsPartitionTotal + missingWeight;
            // totalFirstClassWeight and totalWeight only include non missing values
            targetCountsSplitRemaining[0] = totalFirstClassWeight - sumWeightsPartitionFirstClass;
            targetCountsSplitRemaining[1] = totalWeight - sumWeightsPartitionTotal - targetCountsSplitRemaining[0];
            binaryPartitionWeights[0] = totalWeight - sumWeightsPartitionTotal;
            boolean isValidSplitLeft = binaryPartitionWeights[0] >= minChildSize && binaryPartitionWeights[1] >= minChildSize;
            binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetCountsSplitRemaining, binaryPartitionWeights[0]);
            binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetCountsSplitPartition, binaryPartitionWeights[1]);
            double postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight + missingWeight);
            double gainLeft = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight + missingWeight);
            // send missings right
            targetCountsSplitPartition[0] = sumWeightsPartitionFirstClass;
            targetCountsSplitPartition[1] = sumWeightsPartitionTotal - sumWeightsPartitionFirstClass;
            binaryPartitionWeights[1] = sumWeightsPartitionTotal;
            targetCountsSplitRemaining[0] = totalFirstClassWeight - sumWeightsPartitionFirstClass + missingWeightFirstClass;
            targetCountsSplitRemaining[1] = totalWeight - sumWeightsPartitionTotal + missingWeight - targetCountsSplitRemaining[0];
            binaryPartitionWeights[0] = totalWeight + missingWeight - sumWeightsPartitionTotal;
            boolean isValidSplitRight = binaryPartitionWeights[0] >= minChildSize && binaryPartitionWeights[1] >= minChildSize;
            binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetCountsSplitRemaining, binaryPartitionWeights[0]);
            binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetCountsSplitPartition, binaryPartitionWeights[1]);
            postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight + missingWeight);
            double gainRight = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight + missingWeight);
            // decide which is better (better gain)
            if (gainLeft >= gainRight) {
                gain = gainLeft;
                isValidSplit = isValidSplitLeft;
                tempMissingsGoLeft = true;
            } else {
                gain = gainRight;
                isValidSplit = isValidSplitRight;
                tempMissingsGoLeft = false;
            }
        } else {
            // assign weights to branches
            targetCountsSplitPartition[0] = sumWeightsPartitionFirstClass;
            targetCountsSplitPartition[1] = sumWeightsPartitionTotal - sumWeightsPartitionFirstClass;
            binaryPartitionWeights[1] = sumWeightsPartitionTotal;
            targetCountsSplitRemaining[0] = totalFirstClassWeight - sumWeightsPartitionFirstClass;
            targetCountsSplitRemaining[1] = totalWeight - sumWeightsPartitionTotal - targetCountsSplitRemaining[0];
            binaryPartitionWeights[0] = totalWeight - sumWeightsPartitionTotal;
            isValidSplit = binaryPartitionWeights[0] >= minChildSize && binaryPartitionWeights[1] >= minChildSize;
            binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetCountsSplitRemaining, binaryPartitionWeights[0]);
            binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetCountsSplitPartition, binaryPartitionWeights[1]);
            double postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight);
            gain = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight);
        }
        // use random tie breaker if gains are equal
        boolean randomTieBreaker = gain == bestPartitionGain ? rd.nextInt(0, 1) == 1 : false;
        // store if better than before or first valid split
        if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
            if (isValidSplit || !isBestSplitValid) {
                bestPartitionGain = gain;
                bestPartitionMask = isRightBranch ? partitionMask : BigInteger.ZERO.setBit(highestBitPosition + 1).subtract(BigInteger.ONE).xor(partitionMask);
                isBestSplitValid = isValidSplit;
                // missingsGoLeft is only used later on if XGBoost Missing Value Handling is used
                if (branchContainsMissingValues) {
                    // missingsGoLeft = isRightBranch;
                    missingsGoLeft = tempMissingsGoLeft;
                } else {
                    // no missing values in this branch
                    // send missing values with the majority
                    missingsGoLeft = isRightBranch ? sumWeightsPartitionTotal < 0.5 * totalWeight : sumWeightsPartitionTotal >= 0.5 * totalWeight;
                }
            }
        }
    }
    if (isBestSplitValid && bestPartitionGain > 0.0) {
        if (useXGBoostMissingValueHandling) {
            return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, NO_MISSED_ROWS, missingsGoLeft ? NominalBinarySplitCandidate.MISSINGS_GO_LEFT : NominalBinarySplitCandidate.MISSINGS_GO_RIGHT);
        }
        return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
    }
    return null;
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) ArrayList(java.util.ArrayList) BigInteger(java.math.BigInteger) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate)

Aggregations

NominalBinarySplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate)13 TreeEnsembleLearnerConfiguration (org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration)10 DataMemberships (org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships)8 RootDataMemberships (org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships)8 NominalMultiwaySplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NominalMultiwaySplitCandidate)8 SplitCandidate (org.knime.base.node.mine.treeensemble2.learner.SplitCandidate)8 TreeNodeNominalBinaryCondition (org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalBinaryCondition)8 Test (org.junit.Test)7 BigInteger (java.math.BigInteger)5 DefaultDataIndexManager (org.knime.base.node.mine.treeensemble2.data.memberships.DefaultDataIndexManager)5 IDataIndexManager (org.knime.base.node.mine.treeensemble2.data.memberships.IDataIndexManager)5 RandomData (org.apache.commons.math.random.RandomData)3 ArrayList (java.util.ArrayList)1 BitSet (java.util.BitSet)1 LinkedHashMap (java.util.LinkedHashMap)1 RealVector (org.apache.commons.math3.linear.RealVector)1 CombinedAttributeValues (org.knime.base.node.mine.treeensemble2.data.BinaryNominalSplitsPCA.CombinedAttributeValues)1