Search in sources :

Example 1 with CombinedAttributeValues

use of org.knime.base.node.mine.treeensemble2.data.BinaryNominalSplitsPCA.CombinedAttributeValues in project knime-core by knime.

the class TreeNominalColumnData method calcBestSplitClassificationBinaryPCA.

/**
 * Searches for the best binary split of this nominal column for a classification target, following the
 * approach proposed by Coppersmith et al. (1999) in their paper
 * "Partitioning Nominal Attributes in Decision Trees".
 *
 * <p>The method proceeds in three steps:
 * <ol>
 * <li>Accumulate per-value class frequencies for the rows in {@code columnMemberships}; nominal values with
 * identical (truncated) class probability vectors are merged into one {@link CombinedAttributeValues}.</li>
 * <li>Order the merged values via {@link BinaryNominalSplitsPCA#calculatePCAOrdering}.</li>
 * <li>Scan the ordered values from the front, evaluating each prefix/suffix partition with
 * {@code impCriterion} and keeping the partition with the highest gain.</li>
 * </ol>
 *
 * @param columnMemberships cursor over the rows of the current branch for this column; it is advanced by this
 *            method
 * @param targetPriors prior class distribution (and prior impurity) of the current branch
 * @param targetColumn the target column used to look up the class of each row
 * @param impCriterion impurity criterion used to compute partition impurities and gains
 * @param nomVals the nominal values of this column; if {@code containsMissingValues()} is true the last entry
 *            is not treated as a regular value
 * @param targetVals the class values of the target; only the number of classes is used here
 * @param rd random source used to break ties between partitions with equal gain
 * @return the best binary split candidate or null if there is no valid split with positive gain (or if the
 *         PCA ordering could not be computed)
 */
private NominalBinarySplitCandidate calcBestSplitClassificationBinaryPCA(final ColumnMemberships columnMemberships, final ClassificationPriors targetPriors, final TreeTargetNominalColumnData targetColumn, final IImpurity impCriterion, final NominalValueRepresentation[] nomVals, final NominalValueRepresentation[] targetVals, final RandomData rd) {
    final TreeEnsembleLearnerConfiguration config = getConfiguration();
    final int minChildSize = config.getMinChildSize();
    final boolean useXGBoostMissingValueHandling = config.getMissingValueHandling() == MissingValueHandling.XGBoost;
    // The algorithm combines attribute values with the same class probabilities into a single attribute
    // therefore it is necessary to track the known classProbabilities
    final LinkedHashMap<ClassProbabilityVector, CombinedAttributeValues> combinedAttValsMap = new LinkedHashMap<ClassProbabilityVector, CombinedAttributeValues>();
    columnMemberships.next();
    double totalWeight = 0.0;
    int start = 0;
    final int lengthNonMissing = containsMissingValues() ? nomVals.length - 1 : nomVals.length;
    for (int att = 0; att < lengthNonMissing; att++) {
        // rows of nominal value 'att' occupy the index range [start, end) of the column
        int end = start + m_nominalValueCounts[att];
        double attWeight = 0.0;
        final double[] classFrequencies = new double[targetVals.length];
        boolean reachedEnd = false;
        for (int index = columnMemberships.getIndexInColumn(); index < end; index = columnMemberships.getIndexInColumn()) {
            double weight = columnMemberships.getRowWeight();
            assert weight > EPSILON : "Instances in columnMemberships must have weights larger than EPSILON.";
            int instanceClass = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            classFrequencies[instanceClass] += weight;
            attWeight += weight;
            totalWeight += weight;
            if (!columnMemberships.next()) {
                // reached end of columnMemberships
                reachedEnd = true;
                break;
            }
        }
        start = end;
        if (attWeight < EPSILON) {
            // attribute value did not occur in this branch or sample
            continue;
        }
        final double[] classProbabilities = new double[targetVals.length];
        for (int i = 0; i < classProbabilities.length; i++) {
            // truncation makes (almost) identical probability vectors compare equal as map keys
            classProbabilities[i] = truncateDouble(8, classFrequencies[i] / attWeight);
        }
        CombinedAttributeValues attVal = new CombinedAttributeValues(classFrequencies, classProbabilities, attWeight, nomVals[att]);
        ClassProbabilityVector classProbabilityVector = new ClassProbabilityVector(classProbabilities);
        CombinedAttributeValues knownAttVal = combinedAttValsMap.get(classProbabilityVector);
        if (knownAttVal == null) {
            combinedAttValsMap.put(classProbabilityVector, attVal);
        } else {
            knownAttVal.combineAttributeValues(attVal);
        }
        if (reachedEnd) {
            break;
        }
    }
    // account for missing values and their weight
    double missingWeight = 0.0;
    double[] missingClassCounts = null;
    // otherwise the current indexInColumn won't be larger than start
    if (columnMemberships.getIndexInColumn() >= start) {
        missingClassCounts = new double[targetVals.length];
        do {
            final double recordWeight = columnMemberships.getRowWeight();
            final int recordClass = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            missingWeight += recordWeight;
            missingClassCounts[recordClass] += recordWeight;
        } while (columnMemberships.next());
    }
    // the branch contains missing values iff rows with a non-negligible weight were collected above
    final boolean branchContainsMissingValues = missingWeight > EPSILON;
    CombinedAttributeValues[] attVals = combinedAttValsMap.values().toArray(new CombinedAttributeValues[combinedAttValsMap.size()]);
    attVals = BinaryNominalSplitsPCA.calculatePCAOrdering(attVals, totalWeight, targetVals.length);
    // EigenDecomposition failed
    if (attVals == null) {
        return null;
    }
    // Start searching for split candidates
    final int highestBitPosition = containsMissingValues() ? nomVals.length - 2 : nomVals.length - 1;
    final double[] binaryImpurityValues = new double[2];
    final double[] binaryPartitionWeights = new double[2];
    double sumRemainingWeights = totalWeight;
    double sumCurrPartitionWeight = 0.0;
    RealVector targetFrequenciesCurrentPartition = MatrixUtils.createRealVector(new double[targetVals.length]);
    RealVector targetFrequenciesRemaining = MatrixUtils.createRealVector(new double[targetVals.length]);
    // initially every combined value belongs to the "remaining" side (summed in insertion order)
    for (CombinedAttributeValues attVal : combinedAttValsMap.values()) {
        targetFrequenciesRemaining = targetFrequenciesRemaining.add(attVal.m_classFrequencyVector);
    }
    BigInteger currPartitionBitMask = BigInteger.ZERO;
    double bestPartitionGain = Double.NEGATIVE_INFINITY;
    BigInteger bestPartitionMask = null;
    boolean isBestSplitValid = false;
    boolean missingsGoLeft = false;
    final double priorImpurity = useXGBoostMissingValueHandling ? targetPriors.getPriorImpurity() : impCriterion.getPartitionImpurity(subtractMissingClassCounts(targetPriors.getDistribution(), missingClassCounts), totalWeight);
    // no need to iterate over full list because at least one value must remain on the other side of the split
    for (int i = 0; i < attVals.length - 1; i++) {
        CombinedAttributeValues currAttVal = attVals[i];
        sumCurrPartitionWeight += currAttVal.m_totalWeight;
        sumRemainingWeights -= currAttVal.m_totalWeight;
        // compare with a tolerance: repeated floating-point additions/subtractions need not reproduce
        // totalWeight bit for bit
        assert Math.abs(sumCurrPartitionWeight + sumRemainingWeights - totalWeight) <= EPSILON * Math.max(1.0, totalWeight) : "The weights of the partitions do not sum up to the total weight.";
        targetFrequenciesCurrentPartition = targetFrequenciesCurrentPartition.add(currAttVal.m_classFrequencyVector);
        targetFrequenciesRemaining = targetFrequenciesRemaining.subtract(currAttVal.m_classFrequencyVector);
        currPartitionBitMask = currPartitionBitMask.or(currAttVal.m_bitMask);
        // the side that holds the highest non-missing value is reported as the right branch
        boolean partitionIsRightBranch = currPartitionBitMask.testBit(highestBitPosition);
        boolean isValidSplit;
        double gain;
        boolean tempMissingsGoLeft = true;
        if (branchContainsMissingValues && useXGBoostMissingValueHandling) {
            // send missing values with partition
            boolean isValidSplitFirst = sumCurrPartitionWeight + missingWeight >= minChildSize && sumRemainingWeights >= minChildSize;
            binaryImpurityValues[0] = impCriterion.getPartitionImpurity(addMissingClassCounts(targetFrequenciesCurrentPartition.toArray(), missingClassCounts), sumCurrPartitionWeight + missingWeight);
            binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetFrequenciesRemaining.toArray(), sumRemainingWeights);
            binaryPartitionWeights[0] = sumCurrPartitionWeight + missingWeight;
            binaryPartitionWeights[1] = sumRemainingWeights;
            double postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight + missingWeight);
            double gainFirst = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight + missingWeight);
            // send missing values with remaining
            boolean isValidSplitSecond = sumCurrPartitionWeight >= minChildSize && sumRemainingWeights + missingWeight >= minChildSize;
            binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetFrequenciesCurrentPartition.toArray(), sumCurrPartitionWeight);
            binaryImpurityValues[1] = impCriterion.getPartitionImpurity(addMissingClassCounts(targetFrequenciesRemaining.toArray(), missingClassCounts), sumRemainingWeights + missingWeight);
            binaryPartitionWeights[0] = sumCurrPartitionWeight;
            binaryPartitionWeights[1] = sumRemainingWeights + missingWeight;
            postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight + missingWeight);
            double gainSecond = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight + missingWeight);
            // choose alternative with better gain
            if (gainFirst >= gainSecond) {
                gain = gainFirst;
                isValidSplit = isValidSplitFirst;
                tempMissingsGoLeft = !partitionIsRightBranch;
            } else {
                gain = gainSecond;
                isValidSplit = isValidSplitSecond;
                tempMissingsGoLeft = partitionIsRightBranch;
            }
        } else {
            // TODO if invalid splits should not be considered skip partition
            isValidSplit = sumCurrPartitionWeight >= minChildSize && sumRemainingWeights >= minChildSize;
            binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetFrequenciesCurrentPartition.toArray(), sumCurrPartitionWeight);
            binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetFrequenciesRemaining.toArray(), sumRemainingWeights);
            binaryPartitionWeights[0] = sumCurrPartitionWeight;
            binaryPartitionWeights[1] = sumRemainingWeights;
            double postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight);
            gain = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight);
        }
        // use random tie breaker if gains are equal (the random source is only consulted on an exact tie)
        boolean randomTieBreaker = gain == bestPartitionGain && rd.nextInt(0, 1) == 1;
        // store if better than before or first valid split
        if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
            // never replace a valid best split with an invalid one
            if (isValidSplit || !isBestSplitValid) {
                bestPartitionGain = gain;
                // the stored mask always describes the right branch: either the current partition itself or
                // its complement within the lowest (highestBitPosition + 1) bits
                bestPartitionMask = partitionIsRightBranch ? currPartitionBitMask : BigInteger.ZERO.setBit(highestBitPosition + 1).subtract(BigInteger.ONE).xor(currPartitionBitMask);
                isBestSplitValid = isValidSplit;
                if (branchContainsMissingValues) {
                    // missing values were encountered during the search for the best split
                    missingsGoLeft = tempMissingsGoLeft;
                } else {
                    // no missing values were encountered during the search for the best split
                    // missing values should be sent with the majority
                    missingsGoLeft = partitionIsRightBranch ? sumCurrPartitionWeight < sumRemainingWeights : sumCurrPartitionWeight >= sumRemainingWeights;
                }
            }
        }
    }
    if (isBestSplitValid && bestPartitionGain > 0.0) {
        if (useXGBoostMissingValueHandling) {
            return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, NO_MISSED_ROWS, missingsGoLeft ? NominalBinarySplitCandidate.MISSINGS_GO_LEFT : NominalBinarySplitCandidate.MISSINGS_GO_RIGHT);
        }
        return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
    }
    return null;
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) LinkedHashMap(java.util.LinkedHashMap) RealVector(org.apache.commons.math3.linear.RealVector) BigInteger(java.math.BigInteger) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate) CombinedAttributeValues(org.knime.base.node.mine.treeensemble2.data.BinaryNominalSplitsPCA.CombinedAttributeValues)

Example 2 with CombinedAttributeValues

use of org.knime.base.node.mine.treeensemble2.data.BinaryNominalSplitsPCA.CombinedAttributeValues in project knime-core by knime.

the class BinaryNominalSplitsPCATest method testCalculateMeanClassProbabilityVector.

/**
 * Checks that the mean class probability vector of the test fixture is (1/3, 1/3, 1/3).
 *
 * In the fixture built by createTestAttVals() every class has a summed frequency of 100 out of a total
 * weight of 300, so each mean class probability is expected to be one third.
 */
@Test
public void testCalculateMeanClassProbabilityVector() {
    final CombinedAttributeValues[] attVals = createTestAttVals();
    final double totalSumWeight = 300;
    final int numTargetVals = 3;
    final RealVector meanClassProbabilityVector = BinaryNominalSplitsPCA.calculateMeanClassProbabilityVector(attVals, totalSumWeight, numTargetVals);
    final double aThird = 1.0 / 3.0;
    // RealVector#equals requires all entries to match exactly, which makes the test depend on the order of
    // floating-point operations in the implementation; compare entry-wise with a small delta instead
    assertEquals(numTargetVals, meanClassProbabilityVector.getDimension());
    for (int i = 0; i < numTargetVals; i++) {
        assertEquals(aThird, meanClassProbabilityVector.getEntry(i), 1e-12);
    }
}
Also used : RealVector(org.apache.commons.math3.linear.RealVector) CombinedAttributeValues(org.knime.base.node.mine.treeensemble2.data.BinaryNominalSplitsPCA.CombinedAttributeValues) Test(org.junit.Test)

Example 3 with CombinedAttributeValues

use of org.knime.base.node.mine.treeensemble2.data.BinaryNominalSplitsPCA.CombinedAttributeValues in project knime-core by knime.

the class BinaryNominalSplitsPCATest method createTestAttVals.

/**
 * Builds the shared test fixture: five nominal values "A" to "E", each with a weight of 60 spread over
 * three classes. Class probabilities are the class frequencies divided by that weight.
 *
 * @return one CombinedAttributeValues object per nominal value
 */
private static CombinedAttributeValues[] createTestAttVals() {
    final double weightPerValue = 60;
    final String[] names = new String[] { "A", "B", "C", "D", "E" };
    final double[][] frequencies = new double[][] { { 40, 10, 10 }, { 10, 40, 10 }, { 20, 30, 10 }, { 20, 15, 25 }, { 10, 5, 45 } };
    final int numValues = names.length;
    final int numClasses = frequencies[0].length;
    final double[][] probabilities = new double[numValues][numClasses];
    final NominalValueRepresentation[] representations = new NominalValueRepresentation[numValues];
    final CombinedAttributeValues[] result = new CombinedAttributeValues[numValues];
    for (int value = 0; value < numValues; value++) {
        representations[value] = new NominalValueRepresentation(names[value], value, weightPerValue);
        // relative class frequencies of this nominal value
        for (int cls = 0; cls < numClasses; cls++) {
            probabilities[value][cls] = frequencies[value][cls] / weightPerValue;
        }
        result[value] = new CombinedAttributeValues(frequencies[value], probabilities[value], weightPerValue, representations[value]);
    }
    return result;
}
Also used : NominalValueRepresentation(org.knime.base.node.mine.treeensemble2.data.NominalValueRepresentation) CombinedAttributeValues(org.knime.base.node.mine.treeensemble2.data.BinaryNominalSplitsPCA.CombinedAttributeValues)

Example 4 with CombinedAttributeValues

use of org.knime.base.node.mine.treeensemble2.data.BinaryNominalSplitsPCA.CombinedAttributeValues in project knime-core by knime.

the class BinaryNominalSplitsPCATest method testCalculateWeightedCovarianceMatrix.

/**
 * Checks the weighted covariance matrix of the test fixture against a reference matrix. The reference is
 * given in a rescaled form (entry (0, 0) equals 10), so the calculated matrix is rescaled the same way
 * before the entry-wise comparison.
 */
@Test
public void testCalculateWeightedCovarianceMatrix() {
    final CombinedAttributeValues[] attVals = createTestAttVals();
    final double totalSumWeight = 300;
    final int numTargetVals = 3;
    final RealVector mean = BinaryNominalSplitsPCA.calculateMeanClassProbabilityVector(attVals, totalSumWeight, numTargetVals);
    final RealMatrix rawCovariance = BinaryNominalSplitsPCA.calculateWeightedCovarianceMatrix(attVals, mean, totalSumWeight, numTargetVals);
    // the reference matrix was rescaled to be easily readable, so the calculated matrix is rescaled the
    // same way: first normalise by entry (0, 0), then multiply by 10
    final RealMatrix actual = rawCovariance.scalarMultiply(1 / rawCovariance.getEntry(0, 0)).scalarMultiply(10);
    final RealMatrix expected = MatrixUtils.createRealMatrix(new double[][] { { 10.0, -4.167, -5.833 }, { -4.167, 14.167, -10.0 }, { -5.833, -10.0, 15.833 } });
    // RealMatrix does override equals, but two matrices are only equal if all entries match exactly.
    // Therefore the assertEquals variant that accepts a delta is applied to every entry.
    assertEquals(expected.getRowDimension(), actual.getRowDimension());
    assertEquals(expected.getColumnDimension(), actual.getColumnDimension());
    final int numRows = actual.getRowDimension();
    final int numCols = actual.getColumnDimension();
    for (int row = 0; row < numRows; row++) {
        for (int col = 0; col < numCols; col++) {
            assertEquals(expected.getEntry(row, col), actual.getEntry(row, col), 0.001);
        }
    }
}
Also used : RealMatrix(org.apache.commons.math3.linear.RealMatrix) RealVector(org.apache.commons.math3.linear.RealVector) CombinedAttributeValues(org.knime.base.node.mine.treeensemble2.data.BinaryNominalSplitsPCA.CombinedAttributeValues) Test(org.junit.Test)

Aggregations

CombinedAttributeValues (org.knime.base.node.mine.treeensemble2.data.BinaryNominalSplitsPCA.CombinedAttributeValues)4 RealVector (org.apache.commons.math3.linear.RealVector)3 Test (org.junit.Test)2 BigInteger (java.math.BigInteger)1 LinkedHashMap (java.util.LinkedHashMap)1 RealMatrix (org.apache.commons.math3.linear.RealMatrix)1 NominalValueRepresentation (org.knime.base.node.mine.treeensemble2.data.NominalValueRepresentation)1 NominalBinarySplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate)1 TreeEnsembleLearnerConfiguration (org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration)1