Search in sources :

Example 21 with TreeEnsembleLearnerConfiguration

use of org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration in project knime-core by knime.

the class TreeLearnerClassification method findBestSplitClassification.

private SplitCandidate findBestSplitClassification(final int currentDepth, final DataMemberships dataMemberships, final ColumnSample columnSample, final TreeNodeSignature treeNodeSignature, final ClassificationPriors targetPriors, final BitSet forbiddenColumnSet) {
    final TreeData data = getData();
    final RandomData rd = getRandomData();
    // final ColumnSampleStrategy colSamplingStrategy = getColSamplingStrategy();
    final TreeEnsembleLearnerConfiguration config = getConfig();
    final int maxLevels = config.getMaxLevels();
    if (maxLevels != TreeEnsembleLearnerConfiguration.MAX_LEVEL_INFINITE && currentDepth >= maxLevels) {
        return null;
    }
    final int minNodeSize = config.getMinNodeSize();
    if (minNodeSize != TreeEnsembleLearnerConfiguration.MIN_NODE_SIZE_UNDEFINED) {
        if (targetPriors.getNrRecords() < minNodeSize) {
            return null;
        }
    }
    final double priorImpurity = targetPriors.getPriorImpurity();
    if (priorImpurity < TreeColumnData.EPSILON) {
        return null;
    }
    final TreeTargetNominalColumnData targetColumn = (TreeTargetNominalColumnData) data.getTargetColumn();
    SplitCandidate splitCandidate = null;
    if (currentDepth == 0 && config.getHardCodedRootColumn() != null) {
        final TreeAttributeColumnData rootColumn = data.getColumn(config.getHardCodedRootColumn());
        // TODO discuss whether this option makes sense with surrogates
        return rootColumn.calcBestSplitClassification(dataMemberships, targetPriors, targetColumn, rd);
    }
    double bestGainValue = 0.0;
    for (TreeAttributeColumnData col : columnSample) {
        if (forbiddenColumnSet.get(col.getMetaData().getAttributeIndex())) {
            continue;
        }
        final SplitCandidate currentColSplit = col.calcBestSplitClassification(dataMemberships, targetPriors, targetColumn, rd);
        if (currentColSplit != null) {
            final double currentGain = currentColSplit.getGainValue();
            final boolean tiebreaker = currentGain == bestGainValue ? (rd.nextInt(0, 1) == 0) : false;
            if (currentColSplit.getGainValue() > bestGainValue || tiebreaker) {
                splitCandidate = currentColSplit;
                bestGainValue = currentGain;
            }
        }
    }
    return splitCandidate;
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) RandomData(org.apache.commons.math.random.RandomData) TreeAttributeColumnData(org.knime.base.node.mine.treeensemble2.data.TreeAttributeColumnData) TreeData(org.knime.base.node.mine.treeensemble2.data.TreeData) TreeTargetNominalColumnData(org.knime.base.node.mine.treeensemble2.data.TreeTargetNominalColumnData)

Example 22 with TreeEnsembleLearnerConfiguration

use of org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration in project knime-core by knime.

the class TreeLearnerClassification method findBestSplitsClassification.

/**
 * Returns a list of SplitCandidates sorted (descending) by their gain
 *
 * @param currentDepth
 * @param rowSampleWeights
 * @param treeNodeSignature
 * @param targetPriors
 * @param forbiddenColumnSet
 * @param membershipController
 * @return
 */
private SplitCandidate[] findBestSplitsClassification(final int currentDepth, final DataMemberships dataMemberships, final ColumnSample columnSample, final TreeNodeSignature treeNodeSignature, final ClassificationPriors targetPriors, final BitSet forbiddenColumnSet) {
    final TreeData data = getData();
    final RandomData rd = getRandomData();
    // final ColumnSampleStrategy colSamplingStrategy = getColSamplingStrategy();
    final TreeEnsembleLearnerConfiguration config = getConfig();
    final int maxLevels = config.getMaxLevels();
    if (maxLevels != TreeEnsembleLearnerConfiguration.MAX_LEVEL_INFINITE && currentDepth >= maxLevels) {
        return null;
    }
    final int minNodeSize = config.getMinNodeSize();
    if (minNodeSize != TreeEnsembleLearnerConfiguration.MIN_NODE_SIZE_UNDEFINED) {
        if (targetPriors.getNrRecords() < minNodeSize) {
            return null;
        }
    }
    final double priorImpurity = targetPriors.getPriorImpurity();
    if (priorImpurity < TreeColumnData.EPSILON) {
        return null;
    }
    final TreeTargetNominalColumnData targetColumn = (TreeTargetNominalColumnData) data.getTargetColumn();
    SplitCandidate splitCandidate = null;
    if (currentDepth == 0 && config.getHardCodedRootColumn() != null) {
        final TreeAttributeColumnData rootColumn = data.getColumn(config.getHardCodedRootColumn());
        // TODO discuss whether this option makes sense with surrogates
        return new SplitCandidate[] { rootColumn.calcBestSplitClassification(dataMemberships, targetPriors, targetColumn, rd) };
    }
    double bestGainValue = 0.0;
    final Comparator<SplitCandidate> comp = new Comparator<SplitCandidate>() {

        @Override
        public int compare(final SplitCandidate o1, final SplitCandidate o2) {
            int compareDouble = -Double.compare(o1.getGainValue(), o2.getGainValue());
            return compareDouble;
        }
    };
    ArrayList<SplitCandidate> candidates = new ArrayList<SplitCandidate>(columnSample.getNumCols());
    for (TreeAttributeColumnData col : columnSample) {
        if (forbiddenColumnSet.get(col.getMetaData().getAttributeIndex())) {
            continue;
        }
        SplitCandidate currentColSplit = col.calcBestSplitClassification(dataMemberships, targetPriors, targetColumn, rd);
        if (currentColSplit != null) {
            candidates.add(currentColSplit);
        }
    }
    if (candidates.isEmpty()) {
        return null;
    }
    candidates.sort(comp);
    return candidates.toArray(new SplitCandidate[candidates.size()]);
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) RandomData(org.apache.commons.math.random.RandomData) TreeAttributeColumnData(org.knime.base.node.mine.treeensemble2.data.TreeAttributeColumnData) ArrayList(java.util.ArrayList) TreeData(org.knime.base.node.mine.treeensemble2.data.TreeData) TreeTargetNominalColumnData(org.knime.base.node.mine.treeensemble2.data.TreeTargetNominalColumnData) Comparator(java.util.Comparator)

Example 23 with TreeEnsembleLearnerConfiguration

use of org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration in project knime-core by knime.

the class TreeLearnerClassification method learnSingleTreeRecursive.

private TreeModelClassification learnSingleTreeRecursive(final ExecutionMonitor exec, final RandomData rd) throws CanceledExecutionException {
    final TreeData data = getData();
    final RowSample rowSampling = getRowSampling();
    final TreeEnsembleLearnerConfiguration config = getConfig();
    final TreeTargetNominalColumnData targetColumn = (TreeTargetNominalColumnData) data.getTargetColumn();
    final // new RootDataMem(rowSampling, getIndexManager());
    DataMemberships rootDataMemberships = new RootDataMemberships(rowSampling, data, getIndexManager());
    ClassificationPriors targetPriors = targetColumn.getDistribution(rootDataMemberships, config);
    BitSet forbiddenColumnSet = new BitSet(data.getNrAttributes());
    // final DataMemberships rootDataMemberships = new IntArrayDataMemberships(sampleWeights, data);
    final TreeNodeSignature rootSignature = TreeNodeSignature.ROOT_SIGNATURE;
    final ColumnSample rootColumnSample = getColSamplingStrategy().getColumnSampleForTreeNode(rootSignature);
    TreeNodeClassification rootNode = null;
    rootNode = buildTreeNode(exec, 0, rootDataMemberships, rootColumnSample, rootSignature, targetPriors, forbiddenColumnSet);
    assert forbiddenColumnSet.cardinality() == 0;
    rootNode.setTreeNodeCondition(TreeNodeTrueCondition.INSTANCE);
    return new TreeModelClassification(rootNode);
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) TreeNodeClassification(org.knime.base.node.mine.treeensemble2.model.TreeNodeClassification) ColumnSample(org.knime.base.node.mine.treeensemble2.sample.column.ColumnSample) BitSet(java.util.BitSet) TreeData(org.knime.base.node.mine.treeensemble2.data.TreeData) RowSample(org.knime.base.node.mine.treeensemble2.sample.row.RowSample) TreeNodeSignature(org.knime.base.node.mine.treeensemble2.model.TreeNodeSignature) TreeTargetNominalColumnData(org.knime.base.node.mine.treeensemble2.data.TreeTargetNominalColumnData) ClassificationPriors(org.knime.base.node.mine.treeensemble2.data.ClassificationPriors) TreeModelClassification(org.knime.base.node.mine.treeensemble2.model.TreeModelClassification)

Example 24 with TreeEnsembleLearnerConfiguration

use of org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration in project knime-core by knime.

the class TreeNumericColumnDataTest method testCalcBestSplitClassification.

@Test
public void testCalcBestSplitClassification() throws Exception {
    TreeEnsembleLearnerConfiguration config = createConfig();
    /* data from J. Fuernkranz, Uni Darmstadt:
         * http://www.ke.tu-darmstadt.de/lehre/archiv/ws0809/mldm/dt.pdf */
    final double[] data = asDataArray("60,70,75,85, 90, 95, 100,120,125,220");
    final String[] target = asStringArray("No,No,No,Yes,Yes,Yes,No, No, No, No");
    Pair<TreeOrdinaryNumericColumnData, TreeTargetNominalColumnData> exampleData = exampleData(config, data, target);
    RandomData rd = config.createRandomData();
    TreeNumericColumnData columnData = exampleData.getFirst();
    TreeTargetNominalColumnData targetData = exampleData.getSecond();
    assertEquals(SplitCriterion.Gini, config.getSplitCriterion());
    double[] rowWeights = new double[data.length];
    Arrays.fill(rowWeights, 1.0);
    TreeData treeData = createTreeDataClassification(exampleData);
    IDataIndexManager indexManager = new DefaultDataIndexManager(treeData);
    DataMemberships dataMemberships = new RootDataMemberships(rowWeights, treeData, indexManager);
    ClassificationPriors priors = targetData.getDistribution(rowWeights, config);
    SplitCandidate splitCandidate = columnData.calcBestSplitClassification(dataMemberships, priors, targetData, rd);
    assertNotNull(splitCandidate);
    assertThat(splitCandidate, instanceOf(NumericSplitCandidate.class));
    assertTrue(splitCandidate.canColumnBeSplitFurther());
    // libre office calc
    assertEquals(/*0.42 - 0.300 */
    0.12, splitCandidate.getGainValue(), 0.00001);
    NumericSplitCandidate numSplitCandidate = (NumericSplitCandidate) splitCandidate;
    TreeNodeNumericCondition[] childConditions = numSplitCandidate.getChildConditions();
    assertEquals(2, childConditions.length);
    assertEquals((95.0 + 100.0) / 2.0, childConditions[0].getSplitValue(), 0.0);
    assertEquals((95.0 + 100.0) / 2.0, childConditions[1].getSplitValue(), 0.0);
    assertEquals(NumericOperator.LessThanOrEqual, childConditions[0].getNumericOperator());
    assertEquals(NumericOperator.LargerThan, childConditions[1].getNumericOperator());
    double[] childRowWeights = new double[data.length];
    System.arraycopy(rowWeights, 0, childRowWeights, 0, rowWeights.length);
    BitSet inChild = columnData.updateChildMemberships(childConditions[0], dataMemberships);
    DataMemberships childMemberships = dataMemberships.createChildMemberships(inChild);
    ClassificationPriors childTargetPriors = targetData.getDistribution(childMemberships, config);
    SplitCandidate splitCandidateChild = columnData.calcBestSplitClassification(childMemberships, childTargetPriors, targetData, rd);
    assertNotNull(splitCandidateChild);
    assertThat(splitCandidateChild, instanceOf(NumericSplitCandidate.class));
    // manually via libre office calc
    assertEquals(0.5, splitCandidateChild.getGainValue(), 0.00001);
    TreeNodeNumericCondition[] childConditions2 = ((NumericSplitCandidate) splitCandidateChild).getChildConditions();
    assertEquals(2, childConditions2.length);
    assertEquals((75.0 + 85.0) / 2.0, childConditions2[0].getSplitValue(), 0.0);
    System.arraycopy(rowWeights, 0, childRowWeights, 0, rowWeights.length);
    inChild = columnData.updateChildMemberships(childConditions[1], dataMemberships);
    childMemberships = dataMemberships.createChildMemberships(inChild);
    childTargetPriors = targetData.getDistribution(childMemberships, config);
    splitCandidateChild = columnData.calcBestSplitClassification(childMemberships, childTargetPriors, targetData, rd);
    assertNull(splitCandidateChild);
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) RandomData(org.apache.commons.math.random.RandomData) TreeNodeNumericCondition(org.knime.base.node.mine.treeensemble2.model.TreeNodeNumericCondition) BitSet(java.util.BitSet) IDataIndexManager(org.knime.base.node.mine.treeensemble2.data.memberships.IDataIndexManager) NumericSplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NumericSplitCandidate) SplitCandidate(org.knime.base.node.mine.treeensemble2.learner.SplitCandidate) NumericMissingSplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NumericMissingSplitCandidate) DefaultDataIndexManager(org.knime.base.node.mine.treeensemble2.data.memberships.DefaultDataIndexManager) DataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) NumericSplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NumericSplitCandidate) Test(org.junit.Test)

Example 25 with TreeEnsembleLearnerConfiguration

use of org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration in project knime-core by knime.

the class TreeNumericColumnDataTest method testUpdateChildMemberships.

/**
 * Tests the {@link TreeNumericColumnData#updateChildMemberships(TreeNodeCondition, DataMemberships)} methods with
 * different conditions including missing values.
 *
 * @throws Exception
 */
@Test
public void testUpdateChildMemberships() throws Exception {
    final TreeEnsembleLearnerConfiguration config = createConfig();
    final TestDataGenerator dataGen = new TestDataGenerator(config);
    final int[] indices = new int[] { 0, 1, 2, 3, 4, 5, 6 };
    final double[] weights = new double[7];
    Arrays.fill(weights, 1.0);
    final DataMemberships dataMem = new MockDataColMem(indices, indices, weights);
    final String noMissingsCSV = "-50, -3, -2, 2, 25, 100, 101";
    final TreeNumericColumnData col = dataGen.createNumericAttributeColumn(noMissingsCSV, "noMissings-col", 0);
    // less than or equals
    TreeNodeNumericCondition numCond = new TreeNodeNumericCondition(col.getMetaData(), -2, NumericOperator.LessThanOrEqual, false);
    BitSet inChild = col.updateChildMemberships(numCond, dataMem);
    BitSet expected = new BitSet(3);
    expected.set(0, 3);
    assertEquals("The produced BitSet is incorrect.", expected, inChild);
    // greater than
    numCond = new TreeNodeNumericCondition(col.getMetaData(), 10, NumericOperator.LargerThan, false);
    inChild = col.updateChildMemberships(numCond, dataMem);
    expected.clear();
    expected.set(4, 7);
    assertEquals("The produced BitSet is incorrect", expected, inChild);
    // with missing values
    final String missingsCSV = "-2, 0, 1, 43, 61, 66, NaN";
    final TreeNumericColumnData colWithMissings = dataGen.createNumericAttributeColumn(missingsCSV, "missings-col", 0);
    // less than or equal or missing
    numCond = new TreeNodeNumericCondition(colWithMissings.getMetaData(), 12, NumericOperator.LessThanOrEqual, true);
    inChild = colWithMissings.updateChildMemberships(numCond, dataMem);
    expected.clear();
    expected.set(0, 3);
    expected.set(6);
    assertEquals("The produced BitSet is incorrect", expected, inChild);
    // less than or equals not missing
    numCond = new TreeNodeNumericCondition(colWithMissings.getMetaData(), 12, NumericOperator.LessThanOrEqual, false);
    inChild = colWithMissings.updateChildMemberships(numCond, dataMem);
    expected.clear();
    expected.set(0, 3);
    assertEquals("The produced BitSet is incorrect", expected, inChild);
    // larger than or missing
    numCond = new TreeNodeNumericCondition(colWithMissings.getMetaData(), 43, NumericOperator.LargerThan, true);
    inChild = colWithMissings.updateChildMemberships(numCond, dataMem);
    expected.clear();
    expected.set(4, 7);
    assertEquals("The produced BitSet is incorrect", expected, inChild);
    // larger than not missing
    numCond = new TreeNodeNumericCondition(colWithMissings.getMetaData(), 12, NumericOperator.LargerThan, false);
    inChild = colWithMissings.updateChildMemberships(numCond, dataMem);
    expected.clear();
    expected.set(3, 6);
    assertEquals("The produced BitSet is incorrect", expected, inChild);
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) TreeNodeNumericCondition(org.knime.base.node.mine.treeensemble2.model.TreeNodeNumericCondition) BitSet(java.util.BitSet) DataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) Test(org.junit.Test)

Aggregations

TreeEnsembleLearnerConfiguration (org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration)62 Test (org.junit.Test)29 DataMemberships (org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships)27 RootDataMemberships (org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships)26 SplitCandidate (org.knime.base.node.mine.treeensemble2.learner.SplitCandidate)19 RandomData (org.apache.commons.math.random.RandomData)17 BitSet (java.util.BitSet)16 DefaultDataIndexManager (org.knime.base.node.mine.treeensemble2.data.memberships.DefaultDataIndexManager)15 NominalBinarySplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate)15 IDataIndexManager (org.knime.base.node.mine.treeensemble2.data.memberships.IDataIndexManager)13 NominalMultiwaySplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NominalMultiwaySplitCandidate)13 TreeData (org.knime.base.node.mine.treeensemble2.data.TreeData)10 TreeNodeNominalBinaryCondition (org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalBinaryCondition)10 TestDataGenerator (org.knime.base.node.mine.treeensemble2.data.TestDataGenerator)9 TreeAttributeColumnData (org.knime.base.node.mine.treeensemble2.data.TreeAttributeColumnData)8 NumericSplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NumericSplitCandidate)8 TreeNodeNumericCondition (org.knime.base.node.mine.treeensemble2.model.TreeNodeNumericCondition)7 NumericMissingSplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NumericMissingSplitCandidate)6 TreeNodeNominalCondition (org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalCondition)6 TreeTargetNominalColumnData (org.knime.base.node.mine.treeensemble2.data.TreeTargetNominalColumnData)5