Search in sources :

Example 1 with NominalBinarySplitCandidate

use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.

the class TreeNominalColumnData method calcBestSplitClassificationBinaryPCA.

/**
 * Implements the approach proposed by Coppersmith et al. (1999) in their paper
 * "Partitioning Nominal Attributes in Decision Trees".
 * <p>
 * Nominal values whose (truncated) class probabilities are identical are merged, the merged values are ordered by
 * {@code BinaryNominalSplitsPCA.calculatePCAOrdering} and every proper prefix of that ordering is evaluated as
 * one side of a binary split.
 *
 * @param columnMemberships cursor over the rows to consider; supplies index in column, original index and row
 *            weight of the current row
 * @param targetPriors supplies the prior impurity (XGBoost missing value handling) or the class distribution from
 *            which the class counts of the missing rows are subtracted
 * @param targetColumn maps an original row index to its target class index
 * @param impCriterion impurity criterion used to score partitions and to compute the gain
 * @param nomVals representations of the nominal values of this column
 * @param targetVals representations of the target classes (only the number of classes is used here)
 * @param rd source of randomness used to break ties between splits with equal gain
 * @return the best binary split candidate or null if there is no valid split with positive gain (or if the PCA
 *         ordering could not be calculated)
 */
private NominalBinarySplitCandidate calcBestSplitClassificationBinaryPCA(final ColumnMemberships columnMemberships, final ClassificationPriors targetPriors, final TreeTargetNominalColumnData targetColumn, final IImpurity impCriterion, final NominalValueRepresentation[] nomVals, final NominalValueRepresentation[] targetVals, final RandomData rd) {
    final TreeEnsembleLearnerConfiguration config = getConfiguration();
    final int minChildSize = config.getMinChildSize();
    final boolean useXGBoostMissingValueHandling = config.getMissingValueHandling() == MissingValueHandling.XGBoost;
    // The algorithm combines attribute values with the same class probabilities into a single attribute
    // therefore it is necessary to track the known classProbabilities
    final LinkedHashMap<ClassProbabilityVector, CombinedAttributeValues> combinedAttValsMap = new LinkedHashMap<ClassProbabilityVector, CombinedAttributeValues>();
    columnMemberships.next();
    double totalWeight = 0.0;
    int start = 0;
    final int lengthNonMissing = containsMissingValues() ? nomVals.length - 1 : nomVals.length;
    // collect class frequencies and weights for every non-missing nominal value
    for (int att = 0; att < lengthNonMissing; att++) {
        int end = start + m_nominalValueCounts[att];
        double attWeight = 0.0;
        final double[] classFrequencies = new double[targetVals.length];
        boolean reachedEnd = false;
        for (int index = columnMemberships.getIndexInColumn(); index < end; index = columnMemberships.getIndexInColumn()) {
            double weight = columnMemberships.getRowWeight();
            assert weight > EPSILON : "Instances in columnMemberships must have weights larger than EPSILON.";
            int instanceClass = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            classFrequencies[instanceClass] += weight;
            attWeight += weight;
            totalWeight += weight;
            if (!columnMemberships.next()) {
                // reached end of columnMemberships
                reachedEnd = true;
                break;
            }
        }
        start = end;
        if (attWeight < EPSILON) {
            // attribute value did not occur in this branch or sample
            continue;
        }
        final double[] classProbabilities = new double[targetVals.length];
        for (int i = 0; i < classProbabilities.length; i++) {
            classProbabilities[i] = truncateDouble(8, classFrequencies[i] / attWeight);
        }
        CombinedAttributeValues attVal = new CombinedAttributeValues(classFrequencies, classProbabilities, attWeight, nomVals[att]);
        ClassProbabilityVector classProbabilityVector = new ClassProbabilityVector(classProbabilities);
        CombinedAttributeValues knownAttVal = combinedAttValsMap.get(classProbabilityVector);
        if (knownAttVal == null) {
            combinedAttValsMap.put(classProbabilityVector, attVal);
        } else {
            knownAttVal.combineAttributeValues(attVal);
        }
        if (reachedEnd) {
            break;
        }
    }
    // account for missing values and their weight
    double missingWeight = 0.0;
    double[] missingClassCounts = null;
    // otherwise the current indexInColumn won't be larger than start
    if (columnMemberships.getIndexInColumn() >= start) {
        missingClassCounts = new double[targetVals.length];
        do {
            final double recordWeight = columnMemberships.getRowWeight();
            final int recordClass = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            missingWeight += recordWeight;
            missingClassCounts[recordClass] += recordWeight;
        } while (columnMemberships.next());
    }
    // the branch contains missing values only if rows with missing value actually carry weight
    final boolean branchContainsMissingValues = missingWeight > EPSILON;
    CombinedAttributeValues[] attVals = combinedAttValsMap.values().toArray(new CombinedAttributeValues[combinedAttValsMap.size()]);
    attVals = BinaryNominalSplitsPCA.calculatePCAOrdering(attVals, totalWeight, targetVals.length);
    // EigenDecomposition failed
    if (attVals == null) {
        return null;
    }
    // Start searching for split candidates
    final int highestBitPosition = containsMissingValues() ? nomVals.length - 2 : nomVals.length - 1;
    final double[] binaryImpurityValues = new double[2];
    final double[] binaryPartitionWeights = new double[2];
    double sumRemainingWeights = totalWeight;
    double sumCurrPartitionWeight = 0.0;
    RealVector targetFrequenciesCurrentPartition = MatrixUtils.createRealVector(new double[targetVals.length]);
    RealVector targetFrequenciesRemaining = MatrixUtils.createRealVector(new double[targetVals.length]);
    // initially all (combined) attribute values are on the "remaining" side
    for (CombinedAttributeValues attVal : combinedAttValsMap.values()) {
        targetFrequenciesRemaining = targetFrequenciesRemaining.add(attVal.m_classFrequencyVector);
    }
    BigInteger currPartitionBitMask = BigInteger.ZERO;
    double bestPartitionGain = Double.NEGATIVE_INFINITY;
    BigInteger bestPartitionMask = null;
    boolean isBestSplitValid = false;
    boolean missingsGoLeft = false;
    final double priorImpurity = useXGBoostMissingValueHandling ? targetPriors.getPriorImpurity() : impCriterion.getPartitionImpurity(subtractMissingClassCounts(targetPriors.getDistribution(), missingClassCounts), totalWeight);
    // no need to iterate over full list because at least one value must remain on the other side of the split
    for (int i = 0; i < attVals.length - 1; i++) {
        CombinedAttributeValues currAttVal = attVals[i];
        sumCurrPartitionWeight += currAttVal.m_totalWeight;
        sumRemainingWeights -= currAttVal.m_totalWeight;
        // compare with a tolerance: the running sums are floating point values and need not add up exactly
        assert Math.abs(sumCurrPartitionWeight + sumRemainingWeights - totalWeight) < EPSILON : "The weights of the partitions do not sum up to the total weight.";
        targetFrequenciesCurrentPartition = targetFrequenciesCurrentPartition.add(currAttVal.m_classFrequencyVector);
        targetFrequenciesRemaining = targetFrequenciesRemaining.subtract(currAttVal.m_classFrequencyVector);
        currPartitionBitMask = currPartitionBitMask.or(currAttVal.m_bitMask);
        // the branch that holds the nominal value with the highest bit is treated as the right branch
        boolean partitionIsRightBranch = currPartitionBitMask.testBit(highestBitPosition);
        boolean isValidSplit;
        double gain;
        boolean tempMissingsGoLeft = true;
        if (branchContainsMissingValues && useXGBoostMissingValueHandling) {
            // send missing values with partition
            boolean isValidSplitFirst = sumCurrPartitionWeight + missingWeight >= minChildSize && sumRemainingWeights >= minChildSize;
            binaryImpurityValues[0] = impCriterion.getPartitionImpurity(addMissingClassCounts(targetFrequenciesCurrentPartition.toArray(), missingClassCounts), sumCurrPartitionWeight + missingWeight);
            binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetFrequenciesRemaining.toArray(), sumRemainingWeights);
            binaryPartitionWeights[0] = sumCurrPartitionWeight + missingWeight;
            binaryPartitionWeights[1] = sumRemainingWeights;
            double postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight + missingWeight);
            double gainFirst = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight + missingWeight);
            // send missing values with remaining
            boolean isValidSplitSecond = sumCurrPartitionWeight >= minChildSize && sumRemainingWeights + missingWeight >= minChildSize;
            binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetFrequenciesCurrentPartition.toArray(), sumCurrPartitionWeight);
            binaryImpurityValues[1] = impCriterion.getPartitionImpurity(addMissingClassCounts(targetFrequenciesRemaining.toArray(), missingClassCounts), sumRemainingWeights + missingWeight);
            binaryPartitionWeights[0] = sumCurrPartitionWeight;
            binaryPartitionWeights[1] = sumRemainingWeights + missingWeight;
            postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight + missingWeight);
            double gainSecond = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight + missingWeight);
            // choose alternative with better gain
            if (gainFirst >= gainSecond) {
                gain = gainFirst;
                isValidSplit = isValidSplitFirst;
                tempMissingsGoLeft = !partitionIsRightBranch;
            } else {
                gain = gainSecond;
                isValidSplit = isValidSplitSecond;
                tempMissingsGoLeft = partitionIsRightBranch;
            }
        } else {
            // TODO if invalid splits should not be considered skip partition
            isValidSplit = sumCurrPartitionWeight >= minChildSize && sumRemainingWeights >= minChildSize;
            binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetFrequenciesCurrentPartition.toArray(), sumCurrPartitionWeight);
            binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetFrequenciesRemaining.toArray(), sumRemainingWeights);
            binaryPartitionWeights[0] = sumCurrPartitionWeight;
            binaryPartitionWeights[1] = sumRemainingWeights;
            double postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight);
            gain = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight);
        }
        // use random tie breaker if gains are equal
        boolean randomTieBreaker = gain == bestPartitionGain ? rd.nextInt(0, 1) == 1 : false;
        // store if better than before or first valid split
        if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
            // never replace a valid best split by an invalid one
            if (isValidSplit || !isBestSplitValid) {
                bestPartitionGain = gain;
                // the stored mask always describes the right branch; invert the partition mask if necessary
                bestPartitionMask = partitionIsRightBranch ? currPartitionBitMask : BigInteger.ZERO.setBit(highestBitPosition + 1).subtract(BigInteger.ONE).xor(currPartitionBitMask);
                isBestSplitValid = isValidSplit;
                if (branchContainsMissingValues) {
                    // missing values were encountered during the search for the best split
                    missingsGoLeft = tempMissingsGoLeft;
                } else {
                    // no missing values were encountered during the search for the best split
                    // missing values should be sent with the majority
                    missingsGoLeft = partitionIsRightBranch ? sumCurrPartitionWeight < sumRemainingWeights : sumCurrPartitionWeight >= sumRemainingWeights;
                }
            }
        }
    }
    if (isBestSplitValid && bestPartitionGain > 0.0) {
        if (useXGBoostMissingValueHandling) {
            return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, NO_MISSED_ROWS, missingsGoLeft ? NominalBinarySplitCandidate.MISSINGS_GO_LEFT : NominalBinarySplitCandidate.MISSINGS_GO_RIGHT);
        }
        return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
    }
    return null;
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) LinkedHashMap(java.util.LinkedHashMap) RealVector(org.apache.commons.math3.linear.RealVector) BigInteger(java.math.BigInteger) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate) CombinedAttributeValues(org.knime.base.node.mine.treeensemble2.data.BinaryNominalSplitsPCA.CombinedAttributeValues)

Example 2 with NominalBinarySplitCandidate

use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.

the class TreeNominalColumnData method calcBestSplitClassificationBinary.

/**
 * Searches for the best binary split of this nominal column for a classification target by enumerating
 * partitions of the nominal values (all partitions for at most 10 values, a random subset otherwise).
 *
 * @param columnMemberships cursor over the rows to consider; supplies index in column, original index and row
 *            weight of the current row
 * @param targetPriors supplies the prior impurity the gain is measured against
 * @param targetColumn maps an original row index to its target class index
 * @param impCriterion impurity criterion used to score partitions and to compute the gain
 * @param nomVals representations of the nominal values of this column
 * @param targetVals representations of the target classes (only the number of classes is used here)
 * @param rd source of randomness for the random split enumeration and for breaking ties between equal gains
 * @return the split candidate with the best gain if that gain is positive, otherwise null; also null if there
 *         is at most one nominal value. The returned candidate is not re-checked against the minimum child size.
 */
NominalBinarySplitCandidate calcBestSplitClassificationBinary(final ColumnMemberships columnMemberships, final ClassificationPriors targetPriors, final TreeTargetNominalColumnData targetColumn, final IImpurity impCriterion, final NominalValueRepresentation[] nomVals, final NominalValueRepresentation[] targetVals, final RandomData rd) {
    if (nomVals.length <= 1) {
        return null;
    }
    final int minChildSize = getConfiguration().getMinChildSize();
    final int lengthNonMissing = containsMissingValues() ? nomVals.length - 1 : nomVals.length;
    // The arrays are sized by nomVals.length (not lengthNonMissing) because the split enumeration below
    // addresses every nominal value index; a trailing missing-value slot simply keeps zero counts and weight
    // instead of causing an ArrayIndexOutOfBoundsException.
    // distribution of target for each attribute value
    final double[][] targetCountsSplitPerAttribute = new double[nomVals.length][targetVals.length];
    // number of valid records for each attribute value
    final double[] attWeights = new double[nomVals.length];
    // number (sum) of total valid values
    double totalWeight = 0.0;
    int start = 0;
    columnMemberships.next();
    for (int att = 0; att < lengthNonMissing; att++) {
        final int end = start + m_nominalValueCounts[att];
        double currentAttValWeight = 0.0;
        // NOTE(review): the return value of next() is ignored here; presumably getIndexInColumn() moves past
        // 'end' once the cursor is exhausted - confirm against ColumnMemberships
        for (int index = columnMemberships.getIndexInColumn(); index < end; columnMemberships.next(), index = columnMemberships.getIndexInColumn()) {
            final double weight = columnMemberships.getRowWeight();
            assert weight > EPSILON : "The usage of datamemberships should ensure that no rows with zero weight are encountered";
            int target = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            targetCountsSplitPerAttribute[att][target] += weight;
            currentAttValWeight += weight;
        }
        totalWeight += currentAttValWeight;
        attWeights[att] = currentAttValWeight;
        start = end;
    }
    BinarySplitEnumeration splitEnumeration;
    if (nomVals.length <= 10) {
        splitEnumeration = new FullBinarySplitEnumeration(nomVals.length);
    } else {
        // operator precedence: evaluates to 1 << 8 = 256 candidate partitions
        int maxSearch = (1 << 10 - 2);
        splitEnumeration = new RandomBinarySplitEnumeration(nomVals.length, maxSearch, rd);
    }
    BigInteger bestPartitionMask = null;
    boolean isBestSplitValid = false;
    double bestPartitionGain = Double.NEGATIVE_INFINITY;
    final double[] targetCountsSplitLeft = new double[targetVals.length];
    final double[] targetCountsSplitRight = new double[targetVals.length];
    final double[] binaryImpurityValues = new double[2];
    final double[] binaryPartitionWeights = new double[2];
    do {
        // aggregate class counts and weights of the two branches for the current partition
        Arrays.fill(targetCountsSplitLeft, 0.0);
        Arrays.fill(targetCountsSplitRight, 0.0);
        double weightLeft = 0.0;
        double weightRight = 0.0;
        for (int i = 0; i < nomVals.length; i++) {
            final boolean isAttributeInRightBranch = splitEnumeration.isInRightBranch(i);
            double[] targetCountsCurrentAttribute = targetCountsSplitPerAttribute[i];
            for (int targetVal = 0; targetVal < targetVals.length; targetVal++) {
                if (isAttributeInRightBranch) {
                    targetCountsSplitRight[targetVal] += targetCountsCurrentAttribute[targetVal];
                } else {
                    targetCountsSplitLeft[targetVal] += targetCountsCurrentAttribute[targetVal];
                }
            }
            if (isAttributeInRightBranch) {
                weightRight += attWeights[i];
            } else {
                weightLeft += attWeights[i];
            }
        }
        binaryPartitionWeights[0] = weightRight;
        binaryPartitionWeights[1] = weightLeft;
        boolean isValidSplit = weightRight >= minChildSize && weightLeft >= minChildSize;
        binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetCountsSplitRight, weightRight);
        binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetCountsSplitLeft, weightLeft);
        double postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight);
        double gain = impCriterion.getGain(targetPriors.getPriorImpurity(), postSplitImpurity, binaryPartitionWeights, totalWeight);
        // use random tie breaker if gains are equal
        boolean randomTieBreaker = gain == bestPartitionGain ? rd.nextInt(0, 1) == 1 : false;
        // store if better than before or first valid split
        if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
            // never replace a valid best split by an invalid one
            if (isValidSplit || !isBestSplitValid) {
                bestPartitionGain = gain;
                bestPartitionMask = splitEnumeration.getValueMask();
                isBestSplitValid = isValidSplit;
            }
        }
    } while (splitEnumeration.next());
    if (bestPartitionGain > 0.0) {
        return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
    }
    return null;
}
Also used : BigInteger(java.math.BigInteger) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate)

Example 3 with NominalBinarySplitCandidate

use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.

the class TreeNominalColumnData method calcBestSplitRegressionBinaryBreiman.

/**
 * Finds the best binary split for a regression target by sorting the nominal values (according to their mean
 * target value) and evaluating every proper prefix of the sorted list as one side of the split.
 * <p>
 * If an attribute value does not appear in the current branch, it is not guaranteed in which child branch this
 * value will fall. (This should not be a problem since we cannot make any assumptions about this attribute value
 * anyway)
 *
 * @param columnMemberships cursor over the rows to consider; supplies index in column, original index and row
 *            weight of the current row
 * @param targetPriors supplies the (weighted) sum of target values and the number of records of the branch
 * @param targetColumn maps an original row index to its numeric target value
 * @param nomVals representations of the nominal values of this column
 * @param rd source of randomness used to break ties between splits with equal gain
 * @return best split candidate or null if there is no split candidate with positive gain or too small child nodes
 */
private NominalBinarySplitCandidate calcBestSplitRegressionBinaryBreiman(final ColumnMemberships columnMemberships, final RegressionPriors targetPriors, final TreeTargetNumericColumnData targetColumn, final NominalValueRepresentation[] nomVals, final RandomData rd) {
    final int minChildSize = getConfiguration().getMinChildSize();
    double sumYTotal = targetPriors.getYSum();
    double sumWeightTotal = targetPriors.getNrRecords();
    final boolean useXGBoostMissingValueHandling = getConfiguration().getMissingValueHandling() == MissingValueHandling.XGBoost;
    boolean branchContainsMissingValues = containsMissingValues();
    double missingWeight = 0.0;
    double missingY = 0.0;
    if (branchContainsMissingValues) {
        // walk backwards from the last row to collect weight and weighted target sum of the missing rows
        columnMemberships.goToLast();
        while (columnMemberships.getIndexInColumn() >= m_idxOfFirstMissing) {
            final double weight = columnMemberships.getRowWeight();
            missingWeight += weight;
            missingY += weight * targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            if (!columnMemberships.previous()) {
                break;
            }
        }
        // from here on the totals describe the non-missing rows only
        sumYTotal -= missingY;
        sumWeightTotal -= missingWeight;
        branchContainsMissingValues = missingWeight > 0.0;
        columnMemberships.reset();
    }
    // criterion of the unsplit node: (sum of y)^2 / (sum of weights)
    final double criterionTotal;
    if (useXGBoostMissingValueHandling) {
        criterionTotal = (sumYTotal + missingY) * (sumYTotal + missingY) / (sumWeightTotal + missingWeight);
    } else {
        criterionTotal = sumYTotal * sumYTotal / sumWeightTotal;
    }
    final ArrayList<AttValTupleRegression> attValList = Lists.newArrayList();
    columnMemberships.next();
    int start = 0;
    final int lengthNonMissing = containsMissingValues() ? nomVals.length - 1 : nomVals.length;
    for (int att = 0; att < lengthNonMissing; att++) {
        double sumY = 0.0;
        double sumWeight = 0.0;
        int end = start + m_nominalValueCounts[att];
        boolean reachedEnd = false;
        for (int index = columnMemberships.getIndexInColumn(); index < end; index = columnMemberships.getIndexInColumn()) {
            double weight = columnMemberships.getRowWeight();
            assert weight > EPSILON : "Instances in columnMemberships must have weights larger than EPSILON.";
            // weighted sum, consistent with missingY above and with the totals the sums are subtracted from
            sumY += weight * targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            sumWeight += weight;
            if (!columnMemberships.next()) {
                reachedEnd = true;
                break;
            }
        }
        start = end;
        if (sumWeight < EPSILON) {
            // we cannot make any assumptions about this attribute value
            continue;
        }
        attValList.add(new AttValTupleRegression(sumY, sumWeight, sumY / sumWeight, nomVals[att]));
        if (reachedEnd) {
            break;
        }
    }
    // compare with a tolerance: both sides are floating point sums accumulated in different order
    assert AbsIsSmallerEpsilon(sumWeights(attValList) - sumWeightTotal) : "The weights of the attribute values does not sum up to the total weight";
    // sort attribute values according to their mean Y value
    attValList.sort(null);
    BigInteger bestPartitionMask = null;
    boolean isBestSplitValid = false;
    double bestPartitionGain = Double.NEGATIVE_INFINITY;
    final int highestBitPosition = containsMissingValues() ? nomVals.length - 2 : nomVals.length - 1;
    double sumYPartition = 0.0;
    double sumWeightPartition = 0.0;
    BigInteger partitionMask = BigInteger.ZERO;
    double sumYRemaining = sumYTotal;
    double sumWeightRemaining = sumWeightTotal;
    boolean missingsGoLeft = true;
    // no need to iterate over full list because at least one value must remain on the other side of the split
    for (int i = 0; i < attValList.size() - 1; i++) {
        AttValTupleRegression attVal = attValList.get(i);
        sumYPartition += attVal.m_sumY;
        sumWeightPartition += attVal.m_sumWeight;
        sumYRemaining -= attVal.m_sumY;
        sumWeightRemaining -= attVal.m_sumWeight;
        assert AbsIsSmallerEpsilon(sumWeightTotal - sumWeightRemaining - sumWeightPartition) : "The weights left and right of the split do not add up to the total weight.";
        assert sumWeightPartition > 0.0 : "The weight of the partition is zero.";
        assert sumWeightRemaining > 0.0 : "The weight of the remaining is zero.";
        partitionMask = partitionMask.or(attVal.m_bitMask);
        double gain;
        boolean isValidSplit;
        boolean tempMissingsGoLeft = true;
        if (branchContainsMissingValues && useXGBoostMissingValueHandling) {
            // alternative 1: missing values are sent with the current partition
            boolean isValidSplitPartitionWithMissing = sumWeightPartition + missingWeight >= minChildSize && sumWeightRemaining >= minChildSize;
            double sumYMissingWithPartition = sumYPartition + missingY;
            double gainMissingWithPartition = sumYMissingWithPartition * sumYMissingWithPartition / (sumWeightPartition + missingWeight) + sumYRemaining * sumYRemaining / sumWeightRemaining - criterionTotal;
            // alternative 2: missing values are sent with the remaining values
            boolean isValidSplitRemainingWithMissing = sumWeightPartition >= minChildSize && sumWeightRemaining + missingWeight >= minChildSize;
            double sumYMissingWithRemaining = sumYRemaining + missingY;
            double gainMissingWithRemaining = sumYPartition * sumYPartition / sumWeightPartition + sumYMissingWithRemaining * sumYMissingWithRemaining / (sumWeightRemaining + missingWeight) - criterionTotal;
            if (gainMissingWithPartition >= gainMissingWithRemaining) {
                gain = gainMissingWithPartition;
                isValidSplit = isValidSplitPartitionWithMissing;
                tempMissingsGoLeft = !partitionMask.testBit(highestBitPosition);
            } else {
                gain = gainMissingWithRemaining;
                isValidSplit = isValidSplitRemainingWithMissing;
                tempMissingsGoLeft = partitionMask.testBit(highestBitPosition);
            }
        } else {
            isValidSplit = sumWeightPartition >= minChildSize && sumWeightRemaining >= minChildSize;
            gain = sumYPartition * sumYPartition / sumWeightPartition + sumYRemaining * sumYRemaining / sumWeightRemaining - criterionTotal;
        }
        // use random tie breaker if gains are equal
        boolean randomTieBreaker = gain == bestPartitionGain ? rd.nextInt(0, 1) == 1 : false;
        // store if better than before or first valid split
        if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
            // never replace a valid best split by an invalid one
            if (isValidSplit || !isBestSplitValid) {
                bestPartitionGain = gain;
                // right branch must by convention always contain the nominal value
                // with the highest assigned integer
                bestPartitionMask = partitionMask.testBit(highestBitPosition) ? partitionMask : BigInteger.ZERO.setBit(highestBitPosition + 1).subtract(BigInteger.ONE).xor(partitionMask);
                isBestSplitValid = isValidSplit;
                if (branchContainsMissingValues) {
                    missingsGoLeft = tempMissingsGoLeft;
                } else {
                    // no missings in this branch, but we still have to provide a direction for missing values
                    // send missings in the direction the most records in the node are sent to
                    boolean sendWithPartition = sumWeightPartition >= sumWeightRemaining;
                    missingsGoLeft = sendWithPartition ? !partitionMask.testBit(highestBitPosition) : partitionMask.testBit(highestBitPosition);
                }
            }
        }
    }
    if (bestPartitionGain > 0.0 && isBestSplitValid) {
        if (useXGBoostMissingValueHandling) {
            return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, NO_MISSED_ROWS, missingsGoLeft ? NominalBinarySplitCandidate.MISSINGS_GO_LEFT : NominalBinarySplitCandidate.MISSINGS_GO_RIGHT);
        }
        return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
    }
    return null;
}
Also used : BigInteger(java.math.BigInteger) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate)

Example 4 with NominalBinarySplitCandidate

use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.

the class TreeNominalColumnData method calcBestSplitRegressionBinary.

/**
 * Searches for the best binary split of this nominal column for a regression target by enumerating partitions
 * of the nominal values (all partitions for at most 10 values, a random subset otherwise) and scoring each one
 * with {@code ySumRight^2 / weightRight + ySumLeft^2 / weightLeft - ySumTotal^2 / nrRecordsTotal}.
 *
 * @param columnMemberships cursor over the rows to consider; supplies index in column, original index and row
 *            weight of the current row
 * @param targetPriors supplies the sum of target values and the number of records used for the baseline
 * @param targetColumn maps an original row index to its numeric target value
 * @param nomVals representations of the nominal values of this column
 * @param rd source of randomness for the random split enumeration and for breaking ties between equal gains
 * @return the split candidate with the best gain if that gain is positive, otherwise null
 */
private NominalBinarySplitCandidate calcBestSplitRegressionBinary(final ColumnMemberships columnMemberships, final RegressionPriors targetPriors, final TreeTargetNumericColumnData targetColumn, final NominalValueRepresentation[] nomVals, final RandomData rd) {
    final int minChildSize = getConfiguration().getMinChildSize();
    final double totalY = targetPriors.getYSum();
    final double totalRecords = targetPriors.getNrRecords();
    final double baseline = totalY * totalY / totalRecords;
    final int valueCount = nomVals.length;
    final double[] yPerValue = new double[valueCount];
    final double[] weightPerValue = new double[valueCount];

    // Pass 1: accumulate the weighted target sum and the weight for every nominal value.
    columnMemberships.next();
    boolean exhausted = false;
    int valueStart = 0;
    for (int valueIdx = 0; valueIdx < valueCount; valueIdx++) {
        final int valueEnd = valueStart + m_nominalValueCounts[valueIdx];
        double weightAcc = 0.0;
        double yAcc = 0.0;
        // consume all rows whose index in the column lies before the end of the current value
        while (!exhausted && columnMemberships.getIndexInColumn() < valueEnd) {
            final double rowWeight = columnMemberships.getRowWeight();
            assert rowWeight > EPSILON : "Instances in columnMemberships must have weights larger than EPSILON.";
            yAcc += rowWeight * targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            weightAcc += rowWeight;
            // next() returning false means the end of columnMemberships was reached
            exhausted = !columnMemberships.next();
        }
        weightPerValue[valueIdx] = weightAcc;
        yPerValue[valueIdx] = yAcc;
        valueStart = valueEnd;
        if (exhausted) {
            break;
        }
    }

    // Pass 2: evaluate the candidate partitions.
    final BinarySplitEnumeration candidates;
    if (valueCount > 10) {
        final int maxSearch = 1 << (10 - 2);
        candidates = new RandomBinarySplitEnumeration(valueCount, maxSearch, rd);
    } else {
        candidates = new FullBinarySplitEnumeration(valueCount);
    }
    double bestGain = Double.NEGATIVE_INFINITY;
    boolean bestIsValid = false;
    BigInteger bestMask = null;
    do {
        double leftWeight = 0.0;
        double leftY = 0.0;
        double rightWeight = 0.0;
        double rightY = 0.0;
        for (int valueIdx = 0; valueIdx < valueCount; valueIdx++) {
            if (candidates.isInRightBranch(valueIdx)) {
                rightWeight += weightPerValue[valueIdx];
                rightY += yPerValue[valueIdx];
            } else {
                leftWeight += weightPerValue[valueIdx];
                leftY += yPerValue[valueIdx];
            }
        }
        final boolean candidateIsValid = rightWeight >= minChildSize && leftWeight >= minChildSize;
        final double candidateGain = rightY * rightY / rightWeight + leftY * leftY / leftWeight - baseline;
        // a coin flip decides between candidates with exactly the same gain
        final boolean wonTie = candidateGain == bestGain && rd.nextInt(0, 1) == 1;
        // candidate is better than before, the first valid one, or won the tie ...
        final boolean contender = candidateGain > bestGain || (!bestIsValid && candidateIsValid) || wonTie;
        // ... but a valid best split is never replaced by an invalid one
        final boolean mayReplace = candidateIsValid || !bestIsValid;
        if (contender && mayReplace) {
            bestGain = candidateGain;
            bestMask = candidates.getValueMask();
            bestIsValid = candidateIsValid;
        }
    } while (candidates.next());

    if (!(bestGain > 0.0)) {
        return null;
    }
    return new NominalBinarySplitCandidate(this, bestGain, bestMask, getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
}
Also used : BigInteger(java.math.BigInteger) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate)

Example 5 with NominalBinarySplitCandidate

use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.

the class TreeNominalColumnDataTest method testCalcBestSplitRegressionBinaryXGBoostMissingValueHandling.

/**
 * Checks the binary nominal split found for a regression target when the configuration uses the
 * XGBoost missing value handling.
 * <p>
 * Two scenarios are evaluated one after the other with the same {@link RandomData} instance: first a
 * column without missing values, then a column that additionally contains a "?" entry (the missing
 * value case).
 *
 * @throws Exception if the test setup or the split calculation fails
 */
@Test
public void testCalcBestSplitRegressionBinaryXGBoostMissingValueHandling() throws Exception {
    final TreeEnsembleLearnerConfiguration config = createConfig(true);
    config.setMissingValueHandling(MissingValueHandling.XGBoost);
    final TestDataGenerator dataGen = new TestDataGenerator(config);

    // ---- scenario 1: no missing values during training ----
    final String noMissingCSV = "A, A, A, B, B, B, B, C, C";
    final String noMissingsTarget = "1, 2, 2, 7, 6, 5, 2, 3, 1";
    final TreeNominalColumnData completeColumn =
        dataGen.createNominalAttributeColumn(noMissingCSV, "noMissings", 0);
    final TreeTargetNumericColumnData completeTarget =
        TestDataGenerator.createNumericTargetColumn(noMissingsTarget);
    final double[] unitWeights9 = new double[9];
    Arrays.fill(unitWeights9, 1.0);
    // identity mapping: row k sits at position k
    final int[] identity9 = new int[9];
    int pos = 0;
    while (pos < identity9.length) {
        identity9[pos] = pos;
        pos++;
    }
    final RandomData rd = config.createRandomData();
    final DataMemberships completeMemberships = new MockDataColMem(identity9, identity9, unitWeights9);
    // even without missing values in training a missing value direction is needed for prediction
    final SplitCandidate completeSplit = completeColumn.calcBestSplitRegression(completeMemberships,
        completeTarget.getPriors(unitWeights9, config), completeTarget, rd);
    assertNotNull("SplitCandidate may not be null", completeSplit);
    assertThat(completeSplit, instanceOf(NominalBinarySplitCandidate.class));
    assertEquals("Wrong gain.", 22.755555, completeSplit.getGainValue(), 1e-5);
    assertTrue("No missing values in dataCol therefore the missedRows BitSet must be empty.",
        completeSplit.getMissedRows().isEmpty());
    final NominalBinarySplitCandidate completeBinarySplit = (NominalBinarySplitCandidate) completeSplit;
    final TreeNodeNominalBinaryCondition[] completeConditions = completeBinarySplit.getChildConditions();
    assertEquals("Binary split candidate must have two children.", 2, completeConditions.length);
    final String[] expectedValues = { "A", "C" };
    final TreeNodeNominalBinaryCondition completeFirst = completeConditions[0];
    final TreeNodeNominalBinaryCondition completeSecond = completeConditions[1];
    assertArrayEquals("Wrong values in split condition.", expectedValues, completeFirst.getValues());
    assertArrayEquals("Wrong values in split condition.", expectedValues, completeSecond.getValues());
    assertFalse("Missings should go with majority", completeFirst.acceptsMissings());
    assertTrue("Missings should go with majority", completeSecond.acceptsMissings());
    assertEquals("Wrong set logic.", SetLogic.IS_NOT_IN, completeFirst.getSetLogic());
    assertEquals("Wrong set logic.", SetLogic.IS_IN, completeSecond.getSetLogic());

    // ---- scenario 2: missing values are present during training ----
    final String missingCSV = "A, A, A, B, B, B, B, C, C, ?";
    final String missingTarget = "1, 2, 2, 7, 6, 5, 2, 3, 1, 8";
    final TreeNominalColumnData gappyColumn = dataGen.createNominalAttributeColumn(missingCSV, "missing", 0);
    final TreeTargetNumericColumnData gappyTarget = TestDataGenerator.createNumericTargetColumn(missingTarget);
    final double[] unitWeights10 = new double[10];
    Arrays.fill(unitWeights10, 1.0);
    final int[] identity10 = new int[10];
    pos = 0;
    while (pos < identity10.length) {
        identity10[pos] = pos;
        pos++;
    }
    final DataMemberships gappyMemberships = new MockDataColMem(identity10, identity10, unitWeights10);
    final SplitCandidate gappySplit = gappyColumn.calcBestSplitRegression(gappyMemberships,
        gappyTarget.getPriors(unitWeights10, config), gappyTarget, rd);
    assertNotNull("SplitCandidate may not be null.", gappySplit);
    assertThat(gappySplit, instanceOf(NominalBinarySplitCandidate.class));
    assertEquals("Wrong gain.", 36.1, gappySplit.getGainValue(), 1e-5);
    assertTrue("Conditions should handle missing values therefore the missedRows BitSet must be empty.",
        gappySplit.getMissedRows().isEmpty());
    final NominalBinarySplitCandidate gappyBinarySplit = (NominalBinarySplitCandidate) gappySplit;
    final TreeNodeNominalBinaryCondition[] gappyConditions = gappyBinarySplit.getChildConditions();
    assertEquals("Binary split candidate must have two children.", 2, gappyConditions.length);
    final TreeNodeNominalBinaryCondition gappyFirst = gappyConditions[0];
    final TreeNodeNominalBinaryCondition gappySecond = gappyConditions[1];
    assertArrayEquals("Wrong values in split condition.", expectedValues, gappyFirst.getValues());
    assertArrayEquals("Wrong values in split condition.", expectedValues, gappySecond.getValues());
    assertTrue("Missings should go with B (because there target values are similar)",
        gappyFirst.acceptsMissings());
    assertFalse("Missings should go with B (because there target values are similar)",
        gappySecond.acceptsMissings());
    assertEquals("Wrong set logic.", SetLogic.IS_NOT_IN, gappyFirst.getSetLogic());
    assertEquals("Wrong set logic.", SetLogic.IS_IN, gappySecond.getSetLogic());
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) RandomData(org.apache.commons.math.random.RandomData) NominalMultiwaySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalMultiwaySplitCandidate) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate) SplitCandidate(org.knime.base.node.mine.treeensemble2.learner.SplitCandidate) DataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) TreeNodeNominalBinaryCondition(org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalBinaryCondition) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate) Test(org.junit.Test)

Aggregations

NominalBinarySplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate): 13, TreeEnsembleLearnerConfiguration (org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration): 10, DataMemberships (org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships): 8, RootDataMemberships (org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships): 8, NominalMultiwaySplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NominalMultiwaySplitCandidate): 8, SplitCandidate (org.knime.base.node.mine.treeensemble2.learner.SplitCandidate): 8, TreeNodeNominalBinaryCondition (org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalBinaryCondition): 8, Test (org.junit.Test): 7, BigInteger (java.math.BigInteger): 5, DefaultDataIndexManager (org.knime.base.node.mine.treeensemble2.data.memberships.DefaultDataIndexManager): 5, IDataIndexManager (org.knime.base.node.mine.treeensemble2.data.memberships.IDataIndexManager): 5, RandomData (org.apache.commons.math.random.RandomData): 3, ArrayList (java.util.ArrayList): 1, BitSet (java.util.BitSet): 1, LinkedHashMap (java.util.LinkedHashMap): 1, RealVector (org.apache.commons.math3.linear.RealVector): 1, CombinedAttributeValues (org.knime.base.node.mine.treeensemble2.data.BinaryNominalSplitsPCA.CombinedAttributeValues): 1