use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.
the class TreeNominalColumnData method calcBestSplitClassificationBinaryPCA.
/**
 * Searches for the best binary split of this nominal column for a classification target, following the approach
 * proposed by Coppersmith et al. (1999) in their paper "Partitioning Nominal Attributes in Decision Trees".
 *
 * <p>
 * Nominal values whose (truncated) class probability vectors coincide are merged into one combined value. The
 * combined values are ordered by {@code BinaryNominalSplitsPCA.calculatePCAOrdering} and every proper prefix of
 * that ordering is evaluated as one side of a candidate split. With XGBoost missing value handling, rows with a
 * missing value are tentatively added to either side and the direction with the higher gain is kept.
 * </p>
 *
 * @param columnMemberships iterator over the rows of the current branch in this column; it is advanced by this
 *            method
 * @param targetPriors prior class distribution and prior impurity of the current branch
 * @param targetColumn provides the class index of each row
 * @param impCriterion impurity criterion used to score the candidate partitions
 * @param nomVals representations of the nominal values of this column; if {@code containsMissingValues()} is
 *            true the last entry is not treated as a regular value
 * @param targetVals the possible class values; only the length is used here
 * @param rd source of randomness used to break ties between equally good partitions
 * @return the best binary split candidate or null if there is no valid split with positive gain
 */
private NominalBinarySplitCandidate calcBestSplitClassificationBinaryPCA(final ColumnMemberships columnMemberships,
    final ClassificationPriors targetPriors, final TreeTargetNominalColumnData targetColumn,
    final IImpurity impCriterion, final NominalValueRepresentation[] nomVals,
    final NominalValueRepresentation[] targetVals, final RandomData rd) {
    final TreeEnsembleLearnerConfiguration config = getConfiguration();
    final int minChildSize = config.getMinChildSize();
    final boolean useXGBoostMissingValueHandling = config.getMissingValueHandling() == MissingValueHandling.XGBoost;
    // The algorithm combines attribute values with the same class probabilities into a single attribute
    // therefore it is necessary to track the known classProbabilities
    final LinkedHashMap<ClassProbabilityVector, CombinedAttributeValues> combinedAttValsMap = new LinkedHashMap<>();
    columnMemberships.next();
    double totalWeight = 0.0;
    int start = 0;
    final int lengthNonMissing = containsMissingValues() ? nomVals.length - 1 : nomVals.length;
    for (int att = 0; att < lengthNonMissing; att++) {
        final int end = start + m_nominalValueCounts[att];
        double attWeight = 0.0;
        final double[] classFrequencies = new double[targetVals.length];
        boolean reachedEnd = false;
        for (int index = columnMemberships.getIndexInColumn(); index < end; index =
            columnMemberships.getIndexInColumn()) {
            final double weight = columnMemberships.getRowWeight();
            assert weight > EPSILON : "Instances in columnMemberships must have weights larger than EPSILON.";
            final int instanceClass = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            classFrequencies[instanceClass] += weight;
            attWeight += weight;
            totalWeight += weight;
            if (!columnMemberships.next()) {
                // reached end of columnMemberships
                reachedEnd = true;
                break;
            }
        }
        start = end;
        if (attWeight < EPSILON) {
            // attribute value did not occur in this branch or sample
            continue;
        }
        final double[] classProbabilities = new double[targetVals.length];
        for (int i = 0; i < classProbabilities.length; i++) {
            // truncation lets values with (almost) identical class distributions share one map key
            classProbabilities[i] = truncateDouble(8, classFrequencies[i] / attWeight);
        }
        final CombinedAttributeValues attVal =
            new CombinedAttributeValues(classFrequencies, classProbabilities, attWeight, nomVals[att]);
        final ClassProbabilityVector classProbabilityVector = new ClassProbabilityVector(classProbabilities);
        final CombinedAttributeValues knownAttVal = combinedAttValsMap.get(classProbabilityVector);
        if (knownAttVal == null) {
            combinedAttValsMap.put(classProbabilityVector, attVal);
        } else {
            knownAttVal.combineAttributeValues(attVal);
        }
        if (reachedEnd) {
            break;
        }
    }
    // account for missing values and their weight
    double missingWeight = 0.0;
    double[] missingClassCounts = null;
    // otherwise the current indexInColumn won't be larger than start
    if (columnMemberships.getIndexInColumn() >= start) {
        missingClassCounts = new double[targetVals.length];
        do {
            final double recordWeight = columnMemberships.getRowWeight();
            final int recordClass = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            missingWeight += recordWeight;
            missingClassCounts[recordClass] += recordWeight;
        } while (columnMemberships.next());
    }
    // whether this branch (not merely the column) contains missing values is decided by the weight seen above
    final boolean branchContainsMissingValues = missingWeight > EPSILON;
    if (combinedAttValsMap.size() < 2) {
        // at least one combined value is needed on each side of a split
        return null;
    }
    // class frequencies of all non-missing rows, accumulated in insertion order of the combined values
    RealVector targetFrequenciesRemaining = MatrixUtils.createRealVector(new double[targetVals.length]);
    for (CombinedAttributeValues attVal : combinedAttValsMap.values()) {
        targetFrequenciesRemaining = targetFrequenciesRemaining.add(attVal.m_classFrequencyVector);
    }
    CombinedAttributeValues[] attVals =
        combinedAttValsMap.values().toArray(new CombinedAttributeValues[combinedAttValsMap.size()]);
    attVals = BinaryNominalSplitsPCA.calculatePCAOrdering(attVals, totalWeight, targetVals.length);
    // EigenDecomposition failed
    if (attVals == null) {
        return null;
    }
    // Start searching for split candidates
    final int highestBitPosition = containsMissingValues() ? nomVals.length - 2 : nomVals.length - 1;
    final double[] binaryImpurityValues = new double[2];
    final double[] binaryPartitionWeights = new double[2];
    double sumRemainingWeights = totalWeight;
    double sumCurrPartitionWeight = 0.0;
    RealVector targetFrequenciesCurrentPartition = MatrixUtils.createRealVector(new double[targetVals.length]);
    BigInteger currPartitionBitMask = BigInteger.ZERO;
    double bestPartitionGain = Double.NEGATIVE_INFINITY;
    BigInteger bestPartitionMask = null;
    boolean isBestSplitValid = false;
    boolean missingsGoLeft = false;
    final double priorImpurity = useXGBoostMissingValueHandling ? targetPriors.getPriorImpurity()
        : impCriterion.getPartitionImpurity(
            subtractMissingClassCounts(targetPriors.getDistribution(), missingClassCounts), totalWeight);
    // no need to iterate over full list because at least one value must remain on the other side of the split
    for (int i = 0; i < attVals.length - 1; i++) {
        final CombinedAttributeValues currAttVal = attVals[i];
        sumCurrPartitionWeight += currAttVal.m_totalWeight;
        sumRemainingWeights -= currAttVal.m_totalWeight;
        // tolerance instead of exact equality: the running sums are subject to floating point rounding
        assert Math.abs(sumCurrPartitionWeight + sumRemainingWeights
            - totalWeight) < EPSILON : "The weights of the partitions do not sum up to the total weight.";
        targetFrequenciesCurrentPartition = targetFrequenciesCurrentPartition.add(currAttVal.m_classFrequencyVector);
        targetFrequenciesRemaining = targetFrequenciesRemaining.subtract(currAttVal.m_classFrequencyVector);
        currPartitionBitMask = currPartitionBitMask.or(currAttVal.m_bitMask);
        // the side holding the value with the highest bit is reported as the right branch
        final boolean partitionIsRightBranch = currPartitionBitMask.testBit(highestBitPosition);
        boolean isValidSplit;
        double gain;
        boolean tempMissingsGoLeft = true;
        if (branchContainsMissingValues && useXGBoostMissingValueHandling) {
            // send missing values with partition
            final boolean isValidSplitFirst =
                sumCurrPartitionWeight + missingWeight >= minChildSize && sumRemainingWeights >= minChildSize;
            binaryImpurityValues[0] = impCriterion.getPartitionImpurity(
                addMissingClassCounts(targetFrequenciesCurrentPartition.toArray(), missingClassCounts),
                sumCurrPartitionWeight + missingWeight);
            binaryImpurityValues[1] =
                impCriterion.getPartitionImpurity(targetFrequenciesRemaining.toArray(), sumRemainingWeights);
            binaryPartitionWeights[0] = sumCurrPartitionWeight + missingWeight;
            binaryPartitionWeights[1] = sumRemainingWeights;
            double postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues,
                binaryPartitionWeights, totalWeight + missingWeight);
            final double gainFirst = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights,
                totalWeight + missingWeight);
            // send missing values with remaining
            final boolean isValidSplitSecond =
                sumCurrPartitionWeight >= minChildSize && sumRemainingWeights + missingWeight >= minChildSize;
            binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetFrequenciesCurrentPartition.toArray(),
                sumCurrPartitionWeight);
            binaryImpurityValues[1] = impCriterion.getPartitionImpurity(
                addMissingClassCounts(targetFrequenciesRemaining.toArray(), missingClassCounts),
                sumRemainingWeights + missingWeight);
            binaryPartitionWeights[0] = sumCurrPartitionWeight;
            binaryPartitionWeights[1] = sumRemainingWeights + missingWeight;
            postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights,
                totalWeight + missingWeight);
            final double gainSecond = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights,
                totalWeight + missingWeight);
            // choose alternative with better gain
            if (gainFirst >= gainSecond) {
                gain = gainFirst;
                isValidSplit = isValidSplitFirst;
                tempMissingsGoLeft = !partitionIsRightBranch;
            } else {
                gain = gainSecond;
                isValidSplit = isValidSplitSecond;
                tempMissingsGoLeft = partitionIsRightBranch;
            }
        } else {
            // TODO if invalid splits should not be considered skip partition
            isValidSplit = sumCurrPartitionWeight >= minChildSize && sumRemainingWeights >= minChildSize;
            binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetFrequenciesCurrentPartition.toArray(),
                sumCurrPartitionWeight);
            binaryImpurityValues[1] =
                impCriterion.getPartitionImpurity(targetFrequenciesRemaining.toArray(), sumRemainingWeights);
            binaryPartitionWeights[0] = sumCurrPartitionWeight;
            binaryPartitionWeights[1] = sumRemainingWeights;
            final double postSplitImpurity =
                impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight);
            gain = impCriterion.getGain(priorImpurity, postSplitImpurity, binaryPartitionWeights, totalWeight);
        }
        // use random tie breaker if gains are equal
        final boolean randomTieBreaker = gain == bestPartitionGain ? rd.nextInt(0, 1) == 1 : false;
        // store if better than before or first valid split
        if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
            if (isValidSplit || !isBestSplitValid) {
                bestPartitionGain = gain;
                // the stored mask always describes the right branch; invert it if the partition is the left one
                bestPartitionMask = partitionIsRightBranch ? currPartitionBitMask
                    : BigInteger.ZERO.setBit(highestBitPosition + 1).subtract(BigInteger.ONE)
                        .xor(currPartitionBitMask);
                isBestSplitValid = isValidSplit;
                if (branchContainsMissingValues) {
                    // missing values were encountered during the search for the best split
                    missingsGoLeft = tempMissingsGoLeft;
                } else {
                    // no missing values were encountered during the search for the best split
                    // missing values should be sent with the majority
                    missingsGoLeft = partitionIsRightBranch ? sumCurrPartitionWeight < sumRemainingWeights
                        : sumCurrPartitionWeight >= sumRemainingWeights;
                }
            }
        }
    }
    if (isBestSplitValid && bestPartitionGain > 0.0) {
        if (useXGBoostMissingValueHandling) {
            return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, NO_MISSED_ROWS,
                missingsGoLeft ? NominalBinarySplitCandidate.MISSINGS_GO_LEFT
                    : NominalBinarySplitCandidate.MISSINGS_GO_RIGHT);
        }
        return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask,
            getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
    }
    return null;
}
use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.
the class TreeNominalColumnData method calcBestSplitClassificationBinary.
/**
 * Searches for the best binary split of this nominal column for a classification target by enumerating
 * partitions of the nominal values (all of them for at most 10 values, a random subset otherwise).
 *
 * @param columnMemberships iterator over the rows of the current branch in this column; it is advanced by this
 *            method
 * @param targetPriors provides the prior impurity of the current branch
 * @param targetColumn provides the class index of each row
 * @param impCriterion impurity criterion used to score the candidate partitions
 * @param nomVals representations of the nominal values of this column; if {@code containsMissingValues()} is
 *            true the last entry is not filled with any row weight
 * @param targetVals the possible class values; only the length is used here
 * @param rd source of randomness for the random enumeration and for breaking ties
 * @return the best binary split candidate or null if there is no valid split with positive gain
 */
NominalBinarySplitCandidate calcBestSplitClassificationBinary(final ColumnMemberships columnMemberships,
    final ClassificationPriors targetPriors, final TreeTargetNominalColumnData targetColumn,
    final IImpurity impCriterion, final NominalValueRepresentation[] nomVals,
    final NominalValueRepresentation[] targetVals, final RandomData rd) {
    if (nomVals.length <= 1) {
        return null;
    }
    final int minChildSize = getConfiguration().getMinChildSize();
    final int lengthNonMissing = containsMissingValues() ? nomVals.length - 1 : nomVals.length;
    // distribution of target for each attribute value; sized by nomVals.length because the split enumeration
    // below addresses every entry of nomVals (a missing value slot simply stays at zero)
    final double[][] targetCountsSplitPerAttribute = new double[nomVals.length][targetVals.length];
    // number of valid records for each attribute value
    final double[] attWeights = new double[nomVals.length];
    // number (sum) of total valid values
    double totalWeight = 0.0;
    int start = 0;
    columnMemberships.next();
    for (int att = 0; att < lengthNonMissing; att++) {
        final int end = start + m_nominalValueCounts[att];
        double currentAttValWeight = 0.0;
        boolean reachedEnd = false;
        for (int index = columnMemberships.getIndexInColumn(); index < end; index =
            columnMemberships.getIndexInColumn()) {
            final double weight = columnMemberships.getRowWeight();
            assert weight > EPSILON : "The usage of datamemberships should ensure that no rows with zero weight are encountered";
            final int target = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            targetCountsSplitPerAttribute[att][target] += weight;
            currentAttValWeight += weight;
            if (!columnMemberships.next()) {
                // reached end of columnMemberships, the remaining attribute values keep zero weight
                reachedEnd = true;
                break;
            }
        }
        totalWeight += currentAttValWeight;
        attWeights[att] = currentAttValWeight;
        start = end;
        if (reachedEnd) {
            break;
        }
    }
    BinarySplitEnumeration splitEnumeration;
    if (nomVals.length <= 10) {
        splitEnumeration = new FullBinarySplitEnumeration(nomVals.length);
    } else {
        // NOTE: '-' binds tighter than '<<', so this evaluates to 1 << 8 = 256
        int maxSearch = (1 << 10 - 2);
        splitEnumeration = new RandomBinarySplitEnumeration(nomVals.length, maxSearch, rd);
    }
    BigInteger bestPartitionMask = null;
    boolean isBestSplitValid = false;
    double bestPartitionGain = Double.NEGATIVE_INFINITY;
    final double[] targetCountsSplitLeft = new double[targetVals.length];
    final double[] targetCountsSplitRight = new double[targetVals.length];
    final double[] binaryImpurityValues = new double[2];
    final double[] binaryPartitionWeights = new double[2];
    do {
        Arrays.fill(targetCountsSplitLeft, 0.0);
        Arrays.fill(targetCountsSplitRight, 0.0);
        double weightLeft = 0.0;
        double weightRight = 0.0;
        for (int i = 0; i < nomVals.length; i++) {
            final boolean isAttributeInRightBranch = splitEnumeration.isInRightBranch(i);
            final double[] targetCountsCurrentAttribute = targetCountsSplitPerAttribute[i];
            for (int targetVal = 0; targetVal < targetVals.length; targetVal++) {
                if (isAttributeInRightBranch) {
                    targetCountsSplitRight[targetVal] += targetCountsCurrentAttribute[targetVal];
                } else {
                    targetCountsSplitLeft[targetVal] += targetCountsCurrentAttribute[targetVal];
                }
            }
            if (isAttributeInRightBranch) {
                weightRight += attWeights[i];
            } else {
                weightLeft += attWeights[i];
            }
        }
        binaryPartitionWeights[0] = weightRight;
        binaryPartitionWeights[1] = weightLeft;
        final boolean isValidSplit = weightRight >= minChildSize && weightLeft >= minChildSize;
        binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetCountsSplitRight, weightRight);
        binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetCountsSplitLeft, weightLeft);
        final double postSplitImpurity =
            impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight);
        final double gain = impCriterion.getGain(targetPriors.getPriorImpurity(), postSplitImpurity,
            binaryPartitionWeights, totalWeight);
        // use random tie breaker if gains are equal
        final boolean randomTieBreaker = gain == bestPartitionGain ? rd.nextInt(0, 1) == 1 : false;
        // store if better than before or first valid split
        if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
            if (isValidSplit || !isBestSplitValid) {
                bestPartitionGain = gain;
                bestPartitionMask = splitEnumeration.getValueMask();
                isBestSplitValid = isValidSplit;
            }
        }
    } while (splitEnumeration.next());
    // an invalid split (a child below minChildSize) may have been remembered as a placeholder; never return it
    if (isBestSplitValid && bestPartitionGain > 0.0) {
        return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask,
            getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
    }
    return null;
}
use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.
the class TreeNominalColumnData method calcBestSplitRegressionBinaryBreiman.
/**
 * Searches for the best binary split of this nominal column for a numeric target. The nominal values that occur
 * in the current branch are sorted (natural ordering of {@code AttValTupleRegression}, intended to be the mean
 * target value) and every proper prefix of that ordering is evaluated as one side of a candidate split. The gain
 * of a split is {@code sumY_a^2 / w_a + sumY_b^2 / w_b - sumY_total^2 / w_total}.
 *
 * <p>
 * If an attribute value does not appear in the current branch, it is not guaranteed in which child branch this
 * value will fall. (This should not be a problem since we cannot make any assumptions about this attribute value
 * anyway)
 * </p>
 *
 * @param columnMemberships iterator over the rows of the current branch in this column; it is repositioned and
 *            advanced by this method
 * @param targetPriors provides the weighted target sum and the record weight of the current branch
 * @param targetColumn provides the target value of each row
 * @param nomVals representations of the nominal values of this column; if {@code containsMissingValues()} is
 *            true the last entry is not treated as a regular value
 * @param rd source of randomness used to break ties between equally good partitions
 * @return best split candidate or null if there is no split candidate with positive gain or too small child nodes
 */
private NominalBinarySplitCandidate calcBestSplitRegressionBinaryBreiman(final ColumnMemberships columnMemberships,
    final RegressionPriors targetPriors, final TreeTargetNumericColumnData targetColumn,
    final NominalValueRepresentation[] nomVals, final RandomData rd) {
    final int minChildSize = getConfiguration().getMinChildSize();
    double sumYTotal = targetPriors.getYSum();
    double sumWeightTotal = targetPriors.getNrRecords();
    final boolean useXGBoostMissingValueHandling =
        getConfiguration().getMissingValueHandling() == MissingValueHandling.XGBoost;
    boolean branchContainsMissingValues = containsMissingValues();
    double missingWeight = 0.0;
    double missingY = 0.0;
    if (branchContainsMissingValues) {
        // walk backwards from the end to collect the rows stored at or after the first missing value index
        columnMemberships.goToLast();
        while (columnMemberships.getIndexInColumn() >= m_idxOfFirstMissing) {
            final double weight = columnMemberships.getRowWeight();
            missingWeight += weight;
            missingY += weight * targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            if (!columnMemberships.previous()) {
                break;
            }
        }
        // from here on the totals describe the non-missing rows only
        sumYTotal -= missingY;
        sumWeightTotal -= missingWeight;
        branchContainsMissingValues = missingWeight > 0.0;
        columnMemberships.reset();
    }
    final double criterionTotal;
    if (useXGBoostMissingValueHandling) {
        // missing rows take part in the split, so the baseline includes them
        criterionTotal = (sumYTotal + missingY) * (sumYTotal + missingY) / (sumWeightTotal + missingWeight);
    } else {
        // baseline of the non-missing rows: (sum of y)^2 / weight, same form as the per-child terms below
        criterionTotal = sumYTotal * sumYTotal / sumWeightTotal;
    }
    final ArrayList<AttValTupleRegression> attValList = Lists.newArrayList();
    columnMemberships.next();
    int start = 0;
    final int lengthNonMissing = containsMissingValues() ? nomVals.length - 1 : nomVals.length;
    for (int att = 0; att < lengthNonMissing; att++) {
        double sumY = 0.0;
        double sumWeight = 0.0;
        final int end = start + m_nominalValueCounts[att];
        boolean reachedEnd = false;
        for (int index = columnMemberships.getIndexInColumn(); index < end; index =
            columnMemberships.getIndexInColumn()) {
            final double weight = columnMemberships.getRowWeight();
            assert weight > EPSILON : "Instances in columnMemberships must have weights larger than EPSILON.";
            // weighted sum, consistent with missingY above and with the totals the sums are subtracted from
            sumY += weight * targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            sumWeight += weight;
            if (!columnMemberships.next()) {
                // reached end of columnMemberships
                reachedEnd = true;
                break;
            }
        }
        start = end;
        if (sumWeight < EPSILON) {
            // we cannot make any assumptions about this attribute value
            continue;
        }
        attValList.add(new AttValTupleRegression(sumY, sumWeight, sumY / sumWeight, nomVals[att]));
        if (reachedEnd) {
            break;
        }
    }
    // tolerance instead of exact equality: both sides are floating point sums built in different orders
    assert AbsIsSmallerEpsilon(sumWeights(attValList)
        - sumWeightTotal) : "The weights of the attribute values does not sum up to the total weight";
    // sort attribute values according to their mean Y value
    attValList.sort(null);
    BigInteger bestPartitionMask = null;
    boolean isBestSplitValid = false;
    double bestPartitionGain = Double.NEGATIVE_INFINITY;
    final int highestBitPosition = containsMissingValues() ? nomVals.length - 2 : nomVals.length - 1;
    double sumYPartition = 0.0;
    double sumWeightPartition = 0.0;
    BigInteger partitionMask = BigInteger.ZERO;
    double sumYRemaining = sumYTotal;
    double sumWeightRemaining = sumWeightTotal;
    boolean missingsGoLeft = true;
    // no need to iterate over full list because at least one value must remain on the other side of the split
    for (int i = 0; i < attValList.size() - 1; i++) {
        final AttValTupleRegression attVal = attValList.get(i);
        sumYPartition += attVal.m_sumY;
        sumWeightPartition += attVal.m_sumWeight;
        sumYRemaining -= attVal.m_sumY;
        sumWeightRemaining -= attVal.m_sumWeight;
        assert AbsIsSmallerEpsilon(sumWeightTotal - sumWeightRemaining
            - sumWeightPartition) : "The weights left and right of the split do not add up to the total weight.";
        assert sumWeightPartition > 0.0 : "The weight of the partition is zero.";
        assert sumWeightRemaining > 0.0 : "The weight of the remaining is zero.";
        partitionMask = partitionMask.or(attVal.m_bitMask);
        double gain;
        boolean isValidSplit;
        boolean tempMissingsGoLeft = true;
        if (branchContainsMissingValues && useXGBoostMissingValueHandling) {
            // alternative 1: missing values go with the current partition
            final boolean isValidSplitPartitionWithMissing =
                sumWeightPartition + missingWeight >= minChildSize && sumWeightRemaining >= minChildSize;
            final double sumYMissingWithPartition = sumYPartition + missingY;
            final double gainMissingWithPartition =
                sumYMissingWithPartition * sumYMissingWithPartition / (sumWeightPartition + missingWeight)
                    + sumYRemaining * sumYRemaining / sumWeightRemaining - criterionTotal;
            // alternative 2: missing values go with the remaining values
            final boolean isValidSplitRemainingWithMissing =
                sumWeightPartition >= minChildSize && sumWeightRemaining + missingWeight >= minChildSize;
            final double sumYMissingWithRemaining = sumYRemaining + missingY;
            final double gainMissingWithRemaining = sumYPartition * sumYPartition / sumWeightPartition
                + sumYMissingWithRemaining * sumYMissingWithRemaining / (sumWeightRemaining + missingWeight)
                - criterionTotal;
            if (gainMissingWithPartition >= gainMissingWithRemaining) {
                gain = gainMissingWithPartition;
                isValidSplit = isValidSplitPartitionWithMissing;
                tempMissingsGoLeft = !partitionMask.testBit(highestBitPosition);
            } else {
                gain = gainMissingWithRemaining;
                isValidSplit = isValidSplitRemainingWithMissing;
                tempMissingsGoLeft = partitionMask.testBit(highestBitPosition);
            }
        } else {
            isValidSplit = sumWeightPartition >= minChildSize && sumWeightRemaining >= minChildSize;
            gain = sumYPartition * sumYPartition / sumWeightPartition
                + sumYRemaining * sumYRemaining / sumWeightRemaining - criterionTotal;
        }
        // use random tie breaker if gains are equal
        final boolean randomTieBreaker = gain == bestPartitionGain ? rd.nextInt(0, 1) == 1 : false;
        // store if better than before or first valid split
        if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
            if (isValidSplit || !isBestSplitValid) {
                bestPartitionGain = gain;
                // right branch must by convention always contain the nominal value
                // with the highest assigned integer
                bestPartitionMask = partitionMask.testBit(highestBitPosition) ? partitionMask
                    : BigInteger.ZERO.setBit(highestBitPosition + 1).subtract(BigInteger.ONE).xor(partitionMask);
                isBestSplitValid = isValidSplit;
                if (branchContainsMissingValues) {
                    missingsGoLeft = tempMissingsGoLeft;
                } else {
                    // no missings in this branch, but we still have to provide a direction for missing values
                    // send missings in the direction the most records in the node are sent to
                    final boolean sendWithPartition = sumWeightPartition >= sumWeightRemaining;
                    missingsGoLeft = sendWithPartition ? !partitionMask.testBit(highestBitPosition)
                        : partitionMask.testBit(highestBitPosition);
                }
            }
        }
    }
    if (bestPartitionGain > 0.0 && isBestSplitValid) {
        if (useXGBoostMissingValueHandling) {
            return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, NO_MISSED_ROWS,
                missingsGoLeft ? NominalBinarySplitCandidate.MISSINGS_GO_LEFT
                    : NominalBinarySplitCandidate.MISSINGS_GO_RIGHT);
        }
        return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask,
            getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
    }
    return null;
}
use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.
the class TreeNominalColumnData method calcBestSplitRegressionBinary.
/**
 * Searches for the best binary split of this nominal column for a numeric target by enumerating partitions of
 * the nominal values (all of them for at most 10 values, a random subset otherwise). The gain of a split is
 * {@code ySumRight^2 / weightRight + ySumLeft^2 / weightLeft - ySumTotal^2 / nrRecordsTotal}.
 *
 * @param columnMemberships iterator over the rows of the current branch in this column; it is advanced by this
 *            method
 * @param targetPriors provides the target sum and the number of records of the current branch
 * @param targetColumn provides the target value of each row
 * @param nomVals representations of the nominal values of this column; every entry is enumerated
 * @param rd source of randomness for the random enumeration and for breaking ties
 * @return the best binary split candidate or null if there is no valid split with positive gain
 */
private NominalBinarySplitCandidate calcBestSplitRegressionBinary(final ColumnMemberships columnMemberships,
    final RegressionPriors targetPriors, final TreeTargetNumericColumnData targetColumn,
    final NominalValueRepresentation[] nomVals, final RandomData rd) {
    final int minChildSize = getConfiguration().getMinChildSize();
    final double ySumTotal = targetPriors.getYSum();
    final double nrRecordsTotal = targetPriors.getNrRecords();
    final double criterionTotal = ySumTotal * ySumTotal / nrRecordsTotal;
    // weighted target sum and weight sum per nominal value
    final double[] ySums = new double[nomVals.length];
    final double[] sumWeightsAttributes = new double[nomVals.length];
    columnMemberships.next();
    int start = 0;
    for (int att = 0; att < nomVals.length; att++) {
        final int end = start + m_nominalValueCounts[att];
        double weightSum = 0.0;
        double ySum = 0.0;
        boolean reachedEnd = false;
        for (int index = columnMemberships.getIndexInColumn(); index < end; index =
            columnMemberships.getIndexInColumn()) {
            final double weight = columnMemberships.getRowWeight();
            assert weight > EPSILON : "Instances in columnMemberships must have weights larger than EPSILON.";
            ySum += weight * targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            weightSum += weight;
            if (!columnMemberships.next()) {
                // reached end of columnMemberships
                reachedEnd = true;
                break;
            }
        }
        sumWeightsAttributes[att] = weightSum;
        ySums[att] = ySum;
        start = end;
        if (reachedEnd) {
            break;
        }
    }
    BinarySplitEnumeration splitEnumeration;
    if (nomVals.length <= 10) {
        splitEnumeration = new FullBinarySplitEnumeration(nomVals.length);
    } else {
        // NOTE: '-' binds tighter than '<<', so this evaluates to 1 << 8 = 256
        int maxSearch = (1 << 10 - 2);
        splitEnumeration = new RandomBinarySplitEnumeration(nomVals.length, maxSearch, rd);
    }
    BigInteger bestPartitionMask = null;
    boolean isBestSplitValid = false;
    double bestPartitionGain = Double.NEGATIVE_INFINITY;
    do {
        double weightLeft = 0.0;
        double ySumLeft = 0.0;
        double weightRight = 0.0;
        double ySumRight = 0.0;
        for (int i = 0; i < nomVals.length; i++) {
            final boolean isAttributeInRightBranch = splitEnumeration.isInRightBranch(i);
            if (isAttributeInRightBranch) {
                weightRight += sumWeightsAttributes[i];
                ySumRight += ySums[i];
            } else {
                weightLeft += sumWeightsAttributes[i];
                ySumLeft += ySums[i];
            }
        }
        final boolean isValidSplit = weightRight >= minChildSize && weightLeft >= minChildSize;
        final double gain =
            ySumRight * ySumRight / weightRight + ySumLeft * ySumLeft / weightLeft - criterionTotal;
        // use random tie breaker if gains are equal
        final boolean randomTieBreaker = gain == bestPartitionGain ? rd.nextInt(0, 1) == 1 : false;
        // store if better than before or first valid split
        if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
            if (isValidSplit || !isBestSplitValid) {
                bestPartitionGain = gain;
                bestPartitionMask = splitEnumeration.getValueMask();
                isBestSplitValid = isValidSplit;
            }
        }
    } while (splitEnumeration.next());
    // an invalid split (a child below minChildSize) may have been remembered as a placeholder; never return it
    if (isBestSplitValid && bestPartitionGain > 0.0) {
        return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask,
            getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
    }
    return null;
}
use of org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate in project knime-core by knime.
the class TreeNominalColumnDataTest method testCalcBestSplitRegressionBinaryXGBoostMissingValueHandling.
/**
 * Checks the binary regression split of a nominal column under XGBoost missing value handling, once for
 * training data without missing values and once for training data that contains a missing value.
 *
 * @throws Exception
 */
@Test
public void testCalcBestSplitRegressionBinaryXGBoostMissingValueHandling() throws Exception {
    final TreeEnsembleLearnerConfiguration cfg = createConfig(true);
    cfg.setMissingValueHandling(MissingValueHandling.XGBoost);
    final TestDataGenerator generator = new TestDataGenerator(cfg);

    // --- scenario 1: no missing values while training (a direction for missings is still required) ---
    TreeNominalColumnData attributeColumn =
        generator.createNominalAttributeColumn("A, A, A, B, B, B, B, C, C", "noMissings", 0);
    TreeTargetNumericColumnData numericTarget =
        TestDataGenerator.createNumericTargetColumn("1, 2, 2, 7, 6, 5, 2, 3, 1");
    double[] rowWeights = new double[9];
    Arrays.fill(rowWeights, 1.0);
    int[] rowIndices = new int[9];
    Arrays.setAll(rowIndices, pos -> pos);
    final RandomData random = cfg.createRandomData();
    DataMemberships memberships = new MockDataColMem(rowIndices, rowIndices, rowWeights);

    SplitCandidate candidate = attributeColumn.calcBestSplitRegression(memberships,
        numericTarget.getPriors(rowWeights, cfg), numericTarget, random);
    assertNotNull("SplitCandidate may not be null", candidate);
    assertThat(candidate, instanceOf(NominalBinarySplitCandidate.class));
    assertEquals("Wrong gain.", 22.755555, candidate.getGainValue(), 1e-5);
    assertTrue("No missing values in dataCol therefore the missedRows BitSet must be empty.",
        candidate.getMissedRows().isEmpty());
    NominalBinarySplitCandidate binaryCandidate = (NominalBinarySplitCandidate) candidate;
    TreeNodeNominalBinaryCondition[] childConditions = binaryCandidate.getChildConditions();
    assertEquals("Binary split candidate must have two children.", 2, childConditions.length);
    // both child conditions are expected to reference the same value set, distinguished by their set logic
    final String[] expectedValues = {"A", "C"};
    assertArrayEquals("Wrong values in split condition.", expectedValues, childConditions[0].getValues());
    assertArrayEquals("Wrong values in split condition.", expectedValues, childConditions[1].getValues());
    assertFalse("Missings should go with majority", childConditions[0].acceptsMissings());
    assertTrue("Missings should go with majority", childConditions[1].acceptsMissings());
    assertEquals("Wrong set logic.", SetLogic.IS_NOT_IN, childConditions[0].getSetLogic());
    assertEquals("Wrong set logic.", SetLogic.IS_IN, childConditions[1].getSetLogic());

    // --- scenario 2: one missing value while training ---
    attributeColumn = generator.createNominalAttributeColumn("A, A, A, B, B, B, B, C, C, ?", "missing", 0);
    numericTarget = TestDataGenerator.createNumericTargetColumn("1, 2, 2, 7, 6, 5, 2, 3, 1, 8");
    rowWeights = new double[10];
    Arrays.fill(rowWeights, 1.0);
    rowIndices = new int[10];
    Arrays.setAll(rowIndices, pos -> pos);
    memberships = new MockDataColMem(rowIndices, rowIndices, rowWeights);

    candidate = attributeColumn.calcBestSplitRegression(memberships, numericTarget.getPriors(rowWeights, cfg),
        numericTarget, random);
    assertNotNull("SplitCandidate may not be null.", candidate);
    assertThat(candidate, instanceOf(NominalBinarySplitCandidate.class));
    assertEquals("Wrong gain.", 36.1, candidate.getGainValue(), 1e-5);
    assertTrue("Conditions should handle missing values therefore the missedRows BitSet must be empty.",
        candidate.getMissedRows().isEmpty());
    binaryCandidate = (NominalBinarySplitCandidate) candidate;
    childConditions = binaryCandidate.getChildConditions();
    assertEquals("Binary split candidate must have two children.", 2, childConditions.length);
    assertArrayEquals("Wrong values in split condition.", expectedValues, childConditions[0].getValues());
    assertArrayEquals("Wrong values in split condition.", expectedValues, childConditions[1].getValues());
    assertTrue("Missings should go with B (because there target values are similar)",
        childConditions[0].acceptsMissings());
    assertFalse("Missings should go with B (because there target values are similar)",
        childConditions[1].acceptsMissings());
    assertEquals("Wrong set logic.", SetLogic.IS_NOT_IN, childConditions[0].getSetLogic());
    assertEquals("Wrong set logic.", SetLogic.IS_IN, childConditions[1].getSetLogic());
}
Aggregations