use of org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships in project knime-core by knime.
the class TreeNominalColumnData method calcBestSplitClassificationBinary.
/**
 * Searches for the best binary partition of this column's nominal values for a classification
 * target by enumerating candidate partitions and scoring each one with the given impurity
 * criterion.
 *
 * <p>Only the non-missing nominal values take part in the enumeration: when
 * {@code containsMissingValues()} is true, the last entry of {@code nomVals} is excluded, and the
 * returned candidate is created with {@code NominalBinarySplitCandidate.NO_MISSINGS}.</p>
 *
 * <p>With at most 10 non-missing values a {@link FullBinarySplitEnumeration} is used; otherwise
 * a {@link RandomBinarySplitEnumeration} limited to 256 partitions.</p>
 *
 * @param columnMemberships row cursor for this column; advanced via {@code next()} while the
 *            per-value target distributions are accumulated
 * @param targetPriors supplies the impurity before the split
 * @param targetColumn supplies the target class index for each row
 * @param impCriterion impurity criterion used to score partitions
 * @param nomVals nominal values of this column
 * @param targetVals nominal values of the target column
 * @param rd random source for the sampled enumeration and for breaking ties between equal gains
 * @return the best split candidate, or {@code null} if fewer than two (non-missing) values are
 *         available or no partition achieves a gain greater than zero
 */
NominalBinarySplitCandidate calcBestSplitClassificationBinary(final ColumnMemberships columnMemberships, final ClassificationPriors targetPriors, final TreeTargetNominalColumnData targetColumn, final IImpurity impCriterion, final NominalValueRepresentation[] nomVals, final NominalValueRepresentation[] targetVals, final RandomData rd) {
    if (nomVals.length <= 1) {
        return null;
    }
    final int minChildSize = getConfiguration().getMinChildSize();
    final int lengthNonMissing = containsMissingValues() ? nomVals.length - 1 : nomVals.length;
    if (lengthNonMissing <= 1) {
        // a single non-missing value cannot be partitioned into two branches
        return null;
    }
    // distribution of target for each attribute value
    final double[][] targetCountsSplitPerAttribute = new double[lengthNonMissing][targetVals.length];
    // number of valid records for each attribute value
    final double[] attWeights = new double[lengthNonMissing];
    // number (sum) of total valid values
    double totalWeight = 0.0;
    int start = 0;
    // Stop reading from the cursor as soon as next() reports that it is exhausted.
    boolean hasMoreRows = columnMemberships.next();
    for (int att = 0; hasMoreRows && att < lengthNonMissing; att++) {
        final int end = start + m_nominalValueCounts[att];
        double currentAttValWeight = 0.0;
        while (hasMoreRows && columnMemberships.getIndexInColumn() < end) {
            final double weight = columnMemberships.getRowWeight();
            assert weight > EPSILON : "The usage of datamemberships should ensure that no rows with zero weight are encountered";
            final int target = targetColumn.getValueFor(columnMemberships.getOriginalIndex());
            targetCountsSplitPerAttribute[att][target] += weight;
            currentAttValWeight += weight;
            hasMoreRows = columnMemberships.next();
        }
        totalWeight += currentAttValWeight;
        attWeights[att] = currentAttValWeight;
        start = end;
    }
    // Enumerate over the non-missing values only: the per-value arrays above have exactly
    // lengthNonMissing entries, so a larger enumeration would index past their end.
    final BinarySplitEnumeration splitEnumeration;
    if (lengthNonMissing <= 10) {
        splitEnumeration = new FullBinarySplitEnumeration(lengthNonMissing);
    } else {
        // Parenthesised to make the precedence explicit: '-' binds tighter than '<<',
        // so the limit is 1 << 8 = 256 partitions.
        final int maxSearch = 1 << (10 - 2);
        splitEnumeration = new RandomBinarySplitEnumeration(lengthNonMissing, maxSearch, rd);
    }
    BigInteger bestPartitionMask = null;
    boolean isBestSplitValid = false;
    double bestPartitionGain = Double.NEGATIVE_INFINITY;
    final double[] targetCountsSplitLeft = new double[targetVals.length];
    final double[] targetCountsSplitRight = new double[targetVals.length];
    // index 0 = right branch, index 1 = left branch
    final double[] binaryImpurityValues = new double[2];
    final double[] binaryPartitionWeights = new double[2];
    do {
        Arrays.fill(targetCountsSplitLeft, 0.0);
        Arrays.fill(targetCountsSplitRight, 0.0);
        double weightLeft = 0.0;
        double weightRight = 0.0;
        for (int i = 0; i < lengthNonMissing; i++) {
            final boolean isAttributeInRightBranch = splitEnumeration.isInRightBranch(i);
            final double[] targetCountsCurrentAttribute = targetCountsSplitPerAttribute[i];
            for (int targetVal = 0; targetVal < targetVals.length; targetVal++) {
                if (isAttributeInRightBranch) {
                    targetCountsSplitRight[targetVal] += targetCountsCurrentAttribute[targetVal];
                } else {
                    targetCountsSplitLeft[targetVal] += targetCountsCurrentAttribute[targetVal];
                }
            }
            if (isAttributeInRightBranch) {
                weightRight += attWeights[i];
            } else {
                weightLeft += attWeights[i];
            }
        }
        binaryPartitionWeights[0] = weightRight;
        binaryPartitionWeights[1] = weightLeft;
        final boolean isValidSplit = weightRight >= minChildSize && weightLeft >= minChildSize;
        binaryImpurityValues[0] = impCriterion.getPartitionImpurity(targetCountsSplitRight, weightRight);
        binaryImpurityValues[1] = impCriterion.getPartitionImpurity(targetCountsSplitLeft, weightLeft);
        final double postSplitImpurity = impCriterion.getPostSplitImpurity(binaryImpurityValues, binaryPartitionWeights, totalWeight);
        final double gain = impCriterion.getGain(targetPriors.getPriorImpurity(), postSplitImpurity, binaryPartitionWeights, totalWeight);
        // use random tie breaker if gains are equal
        final boolean randomTieBreaker = gain == bestPartitionGain && rd.nextInt(0, 1) == 1;
        // store if better than before or first valid split
        if (gain > bestPartitionGain || (!isBestSplitValid && isValidSplit) || randomTieBreaker) {
            // never replace a valid best split by an invalid one
            if (isValidSplit || !isBestSplitValid) {
                bestPartitionGain = gain;
                bestPartitionMask = splitEnumeration.getValueMask();
                isBestSplitValid = isValidSplit;
            }
        }
    } while (splitEnumeration.next());
    if (bestPartitionGain > 0.0) {
        return new NominalBinarySplitCandidate(this, bestPartitionGain, bestPartitionMask, getMissedRows(columnMemberships), NominalBinarySplitCandidate.NO_MISSINGS);
    }
    return null;
}
use of org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships in project knime-core by knime.
the class TreeNominalColumnData method updateChildMembershipsMultiway.
/**
 * Determines which rows of the parent node fall into the child described by a multiway nominal
 * condition.
 *
 * <p>The condition's value is looked up among this column's nominal values; the rows whose
 * column index lies in that value's range are flagged. If the condition accepts missing values
 * and the column contains any, the rows from the missing-value offset onwards are flagged too.</p>
 *
 * @param nomCondition the child's condition (a single nominal value)
 * @param parentMemberships memberships of the parent node
 * @return bit set, indexed by position in the parent's data memberships, of the rows in the child
 * @throws IllegalStateException if the condition's value is not among this column's values
 */
private BitSet updateChildMembershipsMultiway(final TreeNodeNominalCondition nomCondition, final DataMemberships parentMemberships) {
    final String conditionValue = nomCondition.getValue();
    final NominalValueRepresentation[] valueReps = getMetaData().getValues();
    int valueIndex = -1;
    for (int i = 0; i < valueReps.length; i++) {
        final NominalValueRepresentation candidate = valueReps[i];
        if (candidate.getNominalValue().equals(conditionValue)) {
            valueIndex = candidate.getAssignedInteger();
            break;
        }
    }
    if (valueIndex == -1) {
        throw new IllegalStateException("Unknown value: " + conditionValue);
    }

    final ColumnMemberships cursor = parentMemberships.getColumnMemberships(getMetaData().getAttributeIndex());
    final BitSet childRows = new BitSet(cursor.size());
    cursor.reset();

    // offset of the first row carrying the condition's value
    int offset = 0;
    for (int v = 0; v < valueIndex; v++) {
        offset += m_nominalValueCounts[v];
    }
    // Make sure that we are using an index >= offset
    if (!cursor.nextIndexFrom(offset)) {
        return childRows;
    }

    final int rangeEnd = offset + m_nominalValueCounts[valueIndex];
    boolean exhausted = false;
    while (!exhausted && cursor.getIndexInColumn() < rangeEnd) {
        childRows.set(cursor.getIndexInDataMemberships());
        exhausted = !cursor.next();
    }

    if (exhausted || !containsMissingValues() || !nomCondition.acceptsMissings()) {
        return childRows;
    }

    // move to missing values
    for (int v = valueIndex; v < valueReps.length - 1; v++) {
        offset += m_nominalValueCounts[v];
    }
    boolean hasRow = cursor.nextIndexFrom(offset);
    while (hasRow) {
        childRows.set(cursor.getIndexInDataMemberships());
        hasRow = cursor.next();
    }
    return childRows;
}
use of org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships in project knime-core by knime.
the class TreeNominalColumnData method calcBestSplitRegression.
/**
 * {@inheritDoc}
 *
 * <p>Delegates to the binary (Breiman) or the multiway split search, depending on whether the
 * configuration requests binary nominal splits.</p>
 */
@Override
public SplitCandidate calcBestSplitRegression(final DataMemberships dataMemberships, final RegressionPriors targetPriors, final TreeTargetNumericColumnData targetColumn, final RandomData rd) {
    final NominalValueRepresentation[] attributeValues = getMetaData().getValues();
    final ColumnMemberships memberships = dataMemberships.getColumnMemberships(getMetaData().getAttributeIndex());
    if (!getConfiguration().isUseBinaryNominalSplits()) {
        return calcBestSplitRegressionMultiway(memberships, targetPriors, targetColumn, attributeValues, rd);
    }
    return calcBestSplitRegressionBinaryBreiman(memberships, targetPriors, targetColumn, attributeValues, rd);
}
use of org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships in project knime-core by knime.
the class TreeNominalColumnData method updateChildMembershipsBinary.
/**
 * Determines which rows of the parent node fall into the child described by a binary nominal
 * condition.
 *
 * <p>For every non-missing nominal value accepted by the condition, the rows whose column index
 * lies in that value's range are flagged. If the condition accepts missing values and the column
 * contains any, the rows from the missing-value offset onwards are flagged as well. Once the
 * row cursor reports that it is exhausted, it is not queried again.</p>
 *
 * @param childBinaryCondition the child's condition (a set of accepted nominal values)
 * @param parentMemberships memberships of the parent node
 * @return bit set, indexed by position in the parent's data memberships, of the rows in the child
 */
private BitSet updateChildMembershipsBinary(final TreeNodeNominalBinaryCondition childBinaryCondition, final DataMemberships parentMemberships) {
    final ColumnMemberships columnMemberships = parentMemberships.getColumnMemberships(getMetaData().getAttributeIndex());
    columnMemberships.reset();
    final BitSet inChild = new BitSet(columnMemberships.size());
    // TODO Check if this can be done more efficiently
    final NominalValueRepresentation[] reps = getMetaData().getValues();
    // offset of the first row of the nominal value currently inspected
    int start = 0;
    boolean reachedEnd = false;
    final int lengthNonMissing = containsMissingValues() ? reps.length - 1 : reps.length;
    for (int att = 0; !reachedEnd && att < lengthNonMissing; att++) {
        if (childBinaryCondition.testCondition(att)) {
            // move columnMemberships to correct position
            if (!columnMemberships.nextIndexFrom(start)) {
                // reached end of columnMemberships: no rows at or beyond this offset remain,
                // so neither later values nor the missing-value range can contribute
                reachedEnd = true;
                break;
            }
            final int end = start + m_nominalValueCounts[att];
            while (columnMemberships.getIndexInColumn() < end) {
                inChild.set(columnMemberships.getIndexInDataMemberships());
                if (!columnMemberships.next()) {
                    reachedEnd = true;
                    break;
                }
            }
        }
        start += m_nominalValueCounts[att];
    }
    // here start is the offset of the missing-value rows (only used if the cursor is not exhausted)
    if (!reachedEnd && containsMissingValues() && childBinaryCondition.acceptsMissings()) {
        if (columnMemberships.nextIndexFrom(start)) {
            do {
                inChild.set(columnMemberships.getIndexInDataMemberships());
            } while (columnMemberships.next());
        }
    }
    return inChild;
}
use of org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships in project knime-core by knime.
the class TreeNominalColumnData method calcBestSplitClassification.
/**
 * {@inheritDoc}
 *
 * <p>Selects the split search by configuration and target: multiway splits when binary nominal
 * splits are disabled; otherwise the two-class search for a target with exactly two values and
 * the PCA-based search for any other target.</p>
 */
@Override
public SplitCandidate calcBestSplitClassification(final DataMemberships dataMemberships, final ClassificationPriors targetPriors, final TreeTargetNominalColumnData targetColumn, final RandomData rd) {
    final NominalValueRepresentation[] classValues = targetColumn.getMetaData().getValues();
    final IImpurity impurity = targetPriors.getImpurityCriterion();
    // nominal values of this attribute
    final NominalValueRepresentation[] attributeValues = getMetaData().getValues();
    final boolean binarySplits = getConfiguration().isUseBinaryNominalSplits();
    final ColumnMemberships memberships = dataMemberships.getColumnMemberships(getMetaData().getAttributeIndex());

    if (!binarySplits) {
        return calcBestSplitClassificationMultiway(memberships, targetPriors, targetColumn, impurity, attributeValues, classValues, rd);
    }
    if (classValues.length == 2) {
        return calcBestSplitClassificationBinaryTwoClass(memberships, targetPriors, targetColumn, impurity, attributeValues, classValues, rd);
    }
    // The enumerating calcBestSplitClassificationBinary is intentionally not called here.
    return calcBestSplitClassificationBinaryPCA(memberships, targetPriors, targetColumn, impurity, attributeValues, classValues, rd);
}
Aggregations