Use of org.knime.base.node.mine.treeensemble2.data.TreeTargetNumericColumnData in project knime-core by knime.
Class TreeNominalColumnDataTest, method testCalcBestSplitRegressionBinary.
/**
 * Checks the result of
 * {@link TreeNominalColumnData#calcBestSplitRegression(DataMemberships, RegressionPriors, TreeTargetNumericColumnData, RandomData)}
 * on the tennis regression fixture when binary splits are used.
 *
 * @throws Exception if building the fixture or calculating the split fails
 */
@Test
public void testCalcBestSplitRegressionBinary() throws Exception {
    // fixture: nominal attribute column plus numeric target column
    final TreeEnsembleLearnerConfiguration learnerConfig = new TreeEnsembleLearnerConfiguration(true);
    final Pair<TreeNominalColumnData, TreeTargetNumericColumnData> fixture = tennisDataRegression(learnerConfig);
    final TreeNominalColumnData nominalColumn = fixture.getFirst();
    final TreeTargetNumericColumnData target = fixture.getSecond();
    final TreeData data = createTreeDataRegression(fixture);

    // every row is part of the root node with weight 1
    final double[] weights = new double[SMALL_COLUMN_DATA.length];
    Arrays.fill(weights, 1.0);
    final IDataIndexManager indices = new DefaultDataIndexManager(data);
    final DataMemberships rootMemberships = new RootDataMemberships(weights, data, indices);
    final RegressionPriors rootPriors = target.getPriors(weights, learnerConfig);

    // no random source is handed to the split calculation
    final SplitCandidate best = nominalColumn.calcBestSplitRegression(rootMemberships, rootPriors, target, null);
    assertNotNull(best);
    assertThat(best, instanceOf(NominalBinarySplitCandidate.class));
    assertTrue(best.canColumnBeSplitFurther());
    assertEquals(32.9143, best.getGainValue(), 0.0001);

    // both children test against the value "R", once negated and once directly
    final NominalBinarySplitCandidate binaryBest = (NominalBinarySplitCandidate) best;
    final TreeNodeNominalBinaryCondition[] conditions = binaryBest.getChildConditions();
    assertEquals(2, conditions.length);
    assertArrayEquals(new String[] { "R" }, conditions[0].getValues());
    assertArrayEquals(new String[] { "R" }, conditions[1].getValues());
    assertEquals(SetLogic.IS_NOT_IN, conditions[0].getSetLogic());
    assertEquals(SetLogic.IS_IN, conditions[1].getSetLogic());
}
Use of org.knime.base.node.mine.treeensemble2.data.TreeTargetNumericColumnData in project knime-core by knime.
Class TreeNumericColumnData, method calcBestSplitRegression.
/**
 * Searches this numeric column for the best binary split of a regression target.
 * <p>
 * The rows of the current branch are scanned through the column memberships. After each row the running
 * left/right target sums and record counts are updated and, wherever the attribute value increases and both
 * sides satisfy the configured minimum child size, the gain
 * {@code ySumLeft^2 / nLeft + ySumRight^2 / nRight - ySumTotal^2 / nTotal} is evaluated. With XGBoost
 * missing value handling the rows with missing values are tentatively added to either side and the better
 * direction is remembered.
 *
 * @param dataMemberships memberships of the rows in the current branch
 * @param targetPriors target statistics (target sum and record count) of the current branch
 * @param targetColumn the numeric target column
 * @param rd random source used to break ties between equally good splits; if {@code null}, the split
 *            that was found first is kept on a tie
 * @return the best split candidate, or {@code null} if all rows of the branch have missing values in
 *         this column or no split yields a positive improvement
 * @throws UnsupportedOperationException if a row of the branch has a non-integer weight
 */
@Override
public SplitCandidate calcBestSplitRegression(final DataMemberships dataMemberships, final RegressionPriors targetPriors, final TreeTargetNumericColumnData targetColumn, final RandomData rd) {
    final TreeEnsembleLearnerConfiguration config = getConfiguration();
    final boolean useAverageSplitPoints = config.isUseAverageSplitPoints();
    final int minChildNodeSize = config.getMinChildSize();
    // get columnMemberships
    final ColumnMemberships columnMemberships = dataMemberships.getColumnMemberships(getMetaData().getAttributeIndex());
    final int lengthNonMissing = getLengthNonMissing();
    // missing value handling
    final boolean useXGBoostMissingValueHandling = config.getMissingValueHandling() == MissingValueHandling.XGBoost;
    // are there missing values in this column (complete column)
    boolean branchContainsMissingValues = containsMissingValues();
    boolean missingsGoLeft = true;
    double missingWeight = 0.0;
    double missingY = 0.0;
    // check if there are missing values in this rowsample
    if (branchContainsMissingValues) {
        // rows with missing values are located at column indices >= lengthNonMissing, so walk backwards
        columnMemberships.goToLast();
        while (columnMemberships.getIndexInColumn() >= lengthNonMissing) {
            final double rowWeight = columnMemberships.getRowWeight();
            missingWeight += rowWeight;
            // weight the target value like every other target sum in this method (see the scan below);
            // an unweighted sum would be inconsistent with missingWeight for rows with weight != 1
            missingY += rowWeight * targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            if (!columnMemberships.previous()) {
                break;
            }
        }
        columnMemberships.reset();
        branchContainsMissingValues = missingWeight > 0.0;
    }
    // statistics of the non-missing part of the branch
    final double ySumTotal = targetPriors.getYSum() - missingY;
    final double nrRecordsTotal = targetPriors.getNrRecords() - missingWeight;
    // with XGBoost handling the missing rows stay part of the node, so the parent term includes them
    final double criterionTotal = useXGBoostMissingValueHandling ? (ySumTotal + missingY) * (ySumTotal + missingY) / (nrRecordsTotal + missingWeight) : ySumTotal * ySumTotal / nrRecordsTotal;
    double ySumLeft = 0.0;
    double nrRecordsLeft = 0.0;
    double ySumRight = ySumTotal;
    double nrRecordsRight = nrRecordsTotal;
    // all values in the current branch are missing
    if (nrRecordsRight == 0) {
        // it is impossible to determine a split
        return null;
    }
    double bestSplit = Double.NEGATIVE_INFINITY;
    double bestImprovement = 0.0;
    double lastSeenY = Double.NaN;
    double lastSeenValue = Double.NEGATIVE_INFINITY;
    double lastSeenWeight = -1.0;
    // compute the gain, keep the one that maximizes the split
    while (columnMemberships.next()) {
        final double weight = columnMemberships.getRowWeight();
        if (weight < EPSILON) {
            // ignore record: not in current branch or not in sample
            continue;
        } else if (Math.floor(weight) != weight) {
            throw new UnsupportedOperationException("weighted records (missing values?) not supported, " + "weight is " + weight);
        }
        final double value = getSorted(columnMemberships.getIndexInColumn());
        if (lastSeenWeight > 0.0) {
            // move the previously seen record from the right side to the left side
            ySumLeft += lastSeenWeight * lastSeenY;
            ySumRight -= lastSeenWeight * lastSeenY;
            nrRecordsLeft += lastSeenWeight;
            nrRecordsRight -= lastSeenWeight;
            // a split is only possible between two distinct attribute values
            if (nrRecordsLeft >= minChildNodeSize && nrRecordsRight >= minChildNodeSize && lastSeenValue < value) {
                boolean tempMissingsGoLeft = true;
                final double childrenSquaredSum;
                if (branchContainsMissingValues && useXGBoostMissingValueHandling) {
                    // try both directions for the missing values and keep the better one
                    final double missingsLeftSum = ((ySumLeft + missingY) * (ySumLeft + missingY) / (nrRecordsLeft + missingWeight)) + (ySumRight * ySumRight / nrRecordsRight);
                    final double missingsRightSum = (ySumLeft * ySumLeft / nrRecordsLeft) + ((ySumRight + missingY) * (ySumRight + missingY) / (nrRecordsRight + missingWeight));
                    if (missingsLeftSum >= missingsRightSum) {
                        childrenSquaredSum = missingsLeftSum;
                        tempMissingsGoLeft = true;
                    } else {
                        childrenSquaredSum = missingsRightSum;
                        tempMissingsGoLeft = false;
                    }
                } else {
                    childrenSquaredSum = (ySumLeft * ySumLeft / nrRecordsLeft) + (ySumRight * ySumRight / nrRecordsRight);
                }
                final double criterion = childrenSquaredSum - criterionTotal;
                // a random number is only drawn on an exact tie; without a random source the first split wins
                final boolean randomTieBreaker = criterion == bestImprovement && rd != null && rd.nextInt(0, 1) == 1;
                if (criterion > bestImprovement || randomTieBreaker) {
                    bestImprovement = criterion;
                    bestSplit = useAverageSplitPoints ? getCenter(lastSeenValue, value) : lastSeenValue;
                    // if there are no missing values go with majority
                    missingsGoLeft = branchContainsMissingValues ? tempMissingsGoLeft : nrRecordsLeft >= nrRecordsRight;
                }
            }
        }
        lastSeenY = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
        lastSeenValue = value;
        lastSeenWeight = weight;
    }
    if (bestImprovement > 0.0) {
        if (useXGBoostMissingValueHandling) {
            // the direction for missing values is encoded in the candidate, no rows are marked as missed
            return new NumericSplitCandidate(this, bestSplit, bestImprovement, new BitSet(), missingsGoLeft ? NumericSplitCandidate.MISSINGS_GO_LEFT : NumericSplitCandidate.MISSINGS_GO_RIGHT);
        }
        return new NumericSplitCandidate(this, bestSplit, bestImprovement, getMissedRows(columnMemberships), NumericSplitCandidate.NO_MISSINGS);
    } else {
        return null;
    }
}
Use of org.knime.base.node.mine.treeensemble2.data.TreeTargetNumericColumnData in project knime-core by knime.
Class TreeLearnerRegression, method buildTreeNode.
/**
 * Recursively builds the regression tree node for the given set of rows.
 * <p>
 * If no split candidate is found, a leaf is returned; for a gradient boosting configuration that leaf
 * additionally receives the original row indices and is added to the leaf list. Otherwise the rows are
 * distributed to child nodes, either through surrogate splits or through the conditions of the best
 * split itself, and each child is built by a recursive call.
 *
 * @param exec monitor that is checked for cancellation
 * @param currentDepth depth of the node to build
 * @param dataMemberships rows belonging to the node
 * @param columnSample columns considered for splitting this node
 * @param treeNodeSignature signature of the node to build
 * @param targetPriors target statistics of the node
 * @param forbiddenColumnSet attribute indices that must not be used for splitting
 * @return the node including its complete subtree
 * @throws CanceledExecutionException if the execution was canceled
 */
private TreeNodeRegression buildTreeNode(final ExecutionMonitor exec, final int currentDepth, final DataMemberships dataMemberships, final ColumnSample columnSample, final TreeNodeSignature treeNodeSignature, final RegressionPriors targetPriors, final BitSet forbiddenColumnSet) throws CanceledExecutionException {
    final TreeData treeData = getData();
    final RandomData random = getRandomData();
    final TreeEnsembleLearnerConfiguration cfg = getConfig();
    exec.checkCanceled();
    final SplitCandidate split = findBestSplitRegression(currentDepth, dataMemberships, columnSample, targetPriors, forbiddenColumnSet);
    if (split == null) {
        // nothing to split on: this node becomes a leaf
        if (!(cfg instanceof GradientBoostingLearnerConfiguration)) {
            return new TreeNodeRegression(treeNodeSignature, targetPriors);
        }
        final TreeNodeRegression leafNode = new TreeNodeRegression(treeNodeSignature, targetPriors, dataMemberships.getOriginalIndices());
        addToLeafList(leafNode);
        return leafNode;
    }
    final TreeTargetNumericColumnData target = (TreeTargetNumericColumnData) treeData.getTargetColumn();
    final TreeNodeRegression[] children;
    if (cfg.getMissingValueHandling() == MissingValueHandling.Surrogate) {
        // surrogate handling: the surrogate split decides which rows go to which of the two children
        final SurrogateSplit surrogates = Surrogates.learnSurrogates(dataMemberships, split, treeData, columnSample, cfg, random);
        final TreeNodeCondition[] conditions = surrogates.getChildConditions();
        final BitSet[] markers = surrogates.getChildMarkers();
        assert markers[0].cardinality() + markers[1].cardinality() == dataMemberships.getRowCount() : "Sum of rows in children does not add up to number of rows in parent.";
        children = new TreeNodeRegression[2];
        for (int child = 0; child < 2; child++) {
            final DataMemberships memberships = dataMemberships.createChildMemberships(markers[child]);
            final TreeNodeSignature signature = getSignatureFactory().getChildSignatureFor(treeNodeSignature, (byte) child);
            final ColumnSample sample = getColSamplingStrategy().getColumnSampleForTreeNode(signature);
            final RegressionPriors priors = target.getPriors(memberships, cfg);
            children[child] = buildTreeNode(exec, currentDepth + 1, memberships, sample, signature, priors, forbiddenColumnSet);
            children[child].setTreeNodeCondition(conditions[child]);
        }
    } else {
        final TreeAttributeColumnData column = split.getColumnData();
        final int columnIndex = column.getMetaData().getAttributeIndex();
        // a column that cannot be split further is blocked while the subtree below is built
        final boolean blockColumn = !split.canColumnBeSplitFurther();
        forbiddenColumnSet.set(columnIndex, blockColumn);
        final TreeNodeCondition[] conditions = split.getChildConditions();
        if (conditions.length > Short.MAX_VALUE) {
            throw new RuntimeException("Too many children when splitting " + "attribute " + split.getColumnData() + " (maximum supported: " + Short.MAX_VALUE + "): " + conditions.length);
        }
        children = new TreeNodeRegression[conditions.length];
        for (int child = 0; child < conditions.length; child++) {
            final TreeNodeCondition condition = conditions[child];
            final DataMemberships memberships = dataMemberships.createChildMemberships(column.updateChildMemberships(condition, dataMemberships));
            final RegressionPriors priors = target.getPriors(memberships, cfg);
            final TreeNodeSignature signature = treeNodeSignature.createChildSignature((byte) child);
            final ColumnSample sample = getColSamplingStrategy().getColumnSampleForTreeNode(signature);
            children[child] = buildTreeNode(exec, currentDepth + 1, memberships, sample, signature, priors, forbiddenColumnSet);
            children[child].setTreeNodeCondition(condition);
        }
        // release the column again once the subtree is complete
        if (blockColumn) {
            forbiddenColumnSet.set(columnIndex, false);
        }
    }
    return new TreeNodeRegression(treeNodeSignature, targetPriors, children);
}
Use of org.knime.base.node.mine.treeensemble2.data.TreeTargetNumericColumnData in project knime-core by knime.
Class TreeLearnerRegression, method findBestSplitRegression.
/**
 * Determines the split candidate with the highest gain for a node.
 * <p>
 * No split is searched if the maximum tree depth is reached, the node has fewer records than the
 * configured minimum node size, or the sum of squared deviations of the target is below
 * {@link TreeColumnData#EPSILON}. At depth 0 a configured hard-coded root column is used directly;
 * otherwise all columns of the sample that are not forbidden are evaluated.
 *
 * @param currentDepth depth of the node to split
 * @param dataMemberships rows belonging to the node
 * @param columnSample columns considered for the split
 * @param targetPriors target statistics of the node
 * @param forbiddenColumnSet attribute indices that must not be used for splitting
 * @return the best split candidate, or {@code null} if the node should not or cannot be split
 */
private SplitCandidate findBestSplitRegression(final int currentDepth, final DataMemberships dataMemberships, final ColumnSample columnSample, final RegressionPriors targetPriors, final BitSet forbiddenColumnSet) {
    final TreeData treeData = getData();
    final RandomData random = getRandomData();
    final TreeEnsembleLearnerConfiguration cfg = getConfig();

    // stop criterion: maximum depth
    final int depthLimit = cfg.getMaxLevels();
    final boolean depthIsLimited = depthLimit != TreeEnsembleLearnerConfiguration.MAX_LEVEL_INFINITE;
    if (depthIsLimited && currentDepth >= depthLimit) {
        return null;
    }

    // stop criterion: too few records in the node
    final int nodeSizeLimit = cfg.getMinNodeSize();
    if (nodeSizeLimit != TreeEnsembleLearnerConfiguration.MIN_NODE_SIZE_UNDEFINED && targetPriors.getNrRecords() < nodeSizeLimit) {
        return null;
    }

    // stop criterion: the target has (almost) no deviation left
    if (targetPriors.getSumSquaredDeviation() < TreeColumnData.EPSILON) {
        return null;
    }

    final TreeTargetNumericColumnData target = getTargetData();
    if (currentDepth == 0 && cfg.getHardCodedRootColumn() != null) {
        // the root split column is fixed by the configuration
        final TreeAttributeColumnData fixedRoot = treeData.getColumn(cfg.getHardCodedRootColumn());
        return fixedRoot.calcBestSplitRegression(dataMemberships, targetPriors, target, random);
    }

    // keep the candidate with the strictly largest positive gain
    SplitCandidate winner = null;
    double winnerGain = 0.0;
    for (final TreeAttributeColumnData column : columnSample) {
        if (forbiddenColumnSet.get(column.getMetaData().getAttributeIndex())) {
            continue;
        }
        final SplitCandidate proposal = column.calcBestSplitRegression(dataMemberships, targetPriors, target, random);
        if (proposal == null) {
            continue;
        }
        final double gain = proposal.getGainValue();
        if (gain > winnerGain) {
            winnerGain = gain;
            winner = proposal;
        }
    }
    return winner;
}
Aggregations