Search in sources :

Example 6 with TreeTargetNominalColumnData

use of org.knime.base.node.mine.treeensemble2.data.TreeTargetNominalColumnData in project knime-core by knime.

the class TreeNominalColumnDataTest method createPCATestData.

/**
 * Builds a nominal attribute column ("test-col") and a matching nominal target column ("target-col")
 * from a fixed table of class counts.
 * <p>
 * The attribute takes the values A to E and the target the classes T1 to T3. For every
 * (attribute value, class) pair the count table states how many rows with that combination are
 * generated; the table sums up to 300 rows. Row keys are created from a running row index starting at 0.
 *
 * @param config the learner configuration handed to the attribute column creator
 * @return a pair of the attribute column (attribute index set to 0) and the target column
 */
private static Pair<TreeNominalColumnData, TreeTargetNominalColumnData> createPCATestData(final TreeEnsembleLearnerConfiguration config) {
    final String[] attributeValues = { "A", "B", "C", "D", "E" };
    final String[] classLabels = { "T1", "T2", "T3" };
    // rowsPerValueAndClass[v][c] = number of rows having attribute value v and class c
    final int[][] rowsPerValueAndClass = { { 40, 10, 10 }, { 10, 40, 10 }, { 20, 30, 10 }, { 20, 15, 25 }, { 10, 5, 45 } };

    final DataColumnSpec attributeSpec = new DataColumnSpecCreator("test-col", StringCell.TYPE).createSpec();
    final TreeNominalColumnDataCreator attributeCreator = new TreeNominalColumnDataCreator(attributeSpec);

    // the target domain consists of the distinct class labels
    final StringCell[] domainCells = Arrays.stream(classLabels).distinct().map(StringCell::new).toArray(StringCell[]::new);
    final DataColumnSpecCreator targetSpecCreator = new DataColumnSpecCreator("target-col", StringCell.TYPE);
    targetSpecCreator.setDomain(new DataColumnDomainCreator(domainCells).createDomain());
    final DataColumnSpec targetSpec = targetSpecCreator.createSpec();
    final TreeTargetColumnDataCreator targetCreator = new TreeTargetNominalColumnDataCreator(targetSpec);

    long nextRowIndex = 0;
    for (int valueIdx = 0; valueIdx < attributeValues.length; valueIdx++) {
        final String attributeValue = attributeValues[valueIdx];
        final int[] rowsPerClass = rowsPerValueAndClass[valueIdx];
        for (int classIdx = 0; classIdx < classLabels.length; classIdx++) {
            final String classLabel = classLabels[classIdx];
            for (int remaining = rowsPerClass[classIdx]; remaining > 0; remaining--) {
                final RowKey rowKey = RowKey.createRowKey(nextRowIndex);
                nextRowIndex++;
                attributeCreator.add(rowKey, new StringCell(attributeValue));
                targetCreator.add(rowKey, new StringCell(classLabel));
            }
        }
    }

    final TreeNominalColumnData attributeColumn = attributeCreator.createColumnData(0, config);
    attributeColumn.getMetaData().setAttributeIndex(0);
    final TreeTargetNominalColumnData targetColumn = (TreeTargetNominalColumnData) targetCreator.createColumnData();
    return Pair.create(attributeColumn, targetColumn);
}
Also used : Arrays(java.util.Arrays) RandomData(org.apache.commons.math.random.RandomData) RowKey(org.knime.core.data.RowKey) IsInstanceOf.instanceOf(org.hamcrest.core.IsInstanceOf.instanceOf) InvalidSettingsException(org.knime.core.node.InvalidSettingsException) SplitCriterion(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration.SplitCriterion) DataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships) TreeNodeNominalCondition(org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalCondition) Pair(org.knime.core.util.Pair) Assert.assertThat(org.junit.Assert.assertThat) ColumnSamplingMode(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration.ColumnSamplingMode) TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) DataColumnSpec(org.knime.core.data.DataColumnSpec) DataColumnDomainCreator(org.knime.core.data.DataColumnDomainCreator) DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) Assert.assertArrayEquals(org.junit.Assert.assertArrayEquals) NominalMultiwaySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalMultiwaySplitCandidate) SetLogic(org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalBinaryCondition.SetLogic) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate) BigInteger(java.math.BigInteger) TreeNodeNominalBinaryCondition(org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalBinaryCondition) SplitCandidate(org.knime.base.node.mine.treeensemble2.learner.SplitCandidate) TreeType(org.knime.base.node.mine.treeensemble2.model.AbstractTreeEnsembleModel.TreeType) Assert.assertNotNull(org.junit.Assert.assertNotNull) IDataIndexManager(org.knime.base.node.mine.treeensemble2.data.memberships.IDataIndexManager) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) 
Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) DoubleCell(org.knime.core.data.def.DoubleCell) DefaultDataIndexManager(org.knime.base.node.mine.treeensemble2.data.memberships.DefaultDataIndexManager) Assert.assertNull(org.junit.Assert.assertNull) Assert.assertFalse(org.junit.Assert.assertFalse) StringCell(org.knime.core.data.def.StringCell) BitSet(java.util.BitSet) MissingValueHandling(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration.MissingValueHandling) Assert.assertEquals(org.junit.Assert.assertEquals) DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) RowKey(org.knime.core.data.RowKey) DataColumnDomainCreator(org.knime.core.data.DataColumnDomainCreator) DataColumnSpec(org.knime.core.data.DataColumnSpec) StringCell(org.knime.core.data.def.StringCell)

Example 7 with TreeTargetNominalColumnData

use of org.knime.base.node.mine.treeensemble2.data.TreeTargetNominalColumnData in project knime-core by knime.

the class TreeNominalColumnDataTest method testCalcBestSplitClassificationMultiWay.

/**
 * Verifies
 * {@link TreeNominalColumnData#calcBestSplitClassification(DataMemberships, ClassificationPriors, TreeTargetNominalColumnData, RandomData)}
 * with binary nominal splits switched off, so that a multiway split candidate is expected.
 *
 * @throws Exception if any step of the test throws
 */
@Test
public void testCalcBestSplitClassificationMultiWay() throws Exception {
    final TreeEnsembleLearnerConfiguration learnerConfig = createConfig(false);
    learnerConfig.setUseBinaryNominalSplits(false);

    final Pair<TreeNominalColumnData, TreeTargetNominalColumnData> columns = tennisData(learnerConfig);
    final TreeNominalColumnData attribute = columns.getFirst();
    final TreeTargetNominalColumnData target = columns.getSecond();
    final TreeData data = createTreeData(columns);
    assertEquals(SplitCriterion.Gini, learnerConfig.getSplitCriterion());

    // every row takes part in the split search with weight 1
    final double[] weights = new double[SMALL_COLUMN_DATA.length];
    Arrays.fill(weights, 1.0);
    final IDataIndexManager indices = new DefaultDataIndexManager(data);
    final DataMemberships memberships = new RootDataMemberships(weights, data, indices);
    final ClassificationPriors classPriors = target.getDistribution(weights, learnerConfig);

    final SplitCandidate candidate = attribute.calcBestSplitClassification(memberships, classPriors, target, null);
    assertNotNull(candidate);
    assertThat(candidate, instanceOf(NominalMultiwaySplitCandidate.class));
    assertFalse(candidate.canColumnBeSplitFurther());
    // expected gain was calculated manually via libre office calc
    assertEquals(0.0744897959, candidate.getGainValue(), 0.00001);

    final NominalMultiwaySplitCandidate multiwayCandidate = (NominalMultiwaySplitCandidate) candidate;
    final TreeNodeNominalCondition[] conditions = multiwayCandidate.getChildConditions();
    assertEquals(3, conditions.length);
    // one child per attribute value, in this order
    final String[] expectedValues = { "S", "O", "R" };
    for (int child = 0; child < expectedValues.length; child++) {
        assertEquals(expectedValues[child], conditions[child].getValue());
    }
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) TreeNodeNominalCondition(org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalCondition) IDataIndexManager(org.knime.base.node.mine.treeensemble2.data.memberships.IDataIndexManager) NominalMultiwaySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalMultiwaySplitCandidate) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate) SplitCandidate(org.knime.base.node.mine.treeensemble2.learner.SplitCandidate) DefaultDataIndexManager(org.knime.base.node.mine.treeensemble2.data.memberships.DefaultDataIndexManager) DataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) NominalMultiwaySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalMultiwaySplitCandidate) Test(org.junit.Test)

Example 8 with TreeTargetNominalColumnData

use of org.knime.base.node.mine.treeensemble2.data.TreeTargetNominalColumnData in project knime-core by knime.

the class TreeNominalColumnDataTest method testCalcBestSplitCassificationBinaryTwoClassXGBoostMissingValue.

/**
 * Tests the XGBoost missing value handling in case of a two class problem. <br>
 * Currently not executed (the {@code @Test} annotation below is commented out) because missing value
 * handling will probably be implemented differently.
 * <p>
 * The test has two parts: first a split search on a column without missing values, then a split search
 * on a column whose CSV representation contains "?" entries. The target column, row weights and priors
 * of the first part are reused in the second part.
 *
 * @throws Exception if any step of the test throws
 */
// @Test
public void testCalcBestSplitCassificationBinaryTwoClassXGBoostMissingValue() throws Exception {
    final TreeEnsembleLearnerConfiguration config = createConfig(false);
    config.setMissingValueHandling(MissingValueHandling.XGBoost);
    final TestDataGenerator dataGen = new TestDataGenerator(config);
    // check correct behavior if no missing values are encountered during split search
    Pair<TreeNominalColumnData, TreeTargetNominalColumnData> twoClassTennisData = twoClassTennisData(config);
    // note the argument order: target column first, attribute column second
    TreeData treeData = dataGen.createTreeData(twoClassTennisData.getSecond(), twoClassTennisData.getFirst());
    IDataIndexManager indexManager = new DefaultDataIndexManager(treeData);
    // all rows take part in the split search with weight 1
    double[] rowWeights = new double[TWO_CLASS_INDICES.length];
    Arrays.fill(rowWeights, 1.0);
    // DataMemberships dataMemberships = TestDataGenerator.createMockDataMemberships(TWO_CLASS_INDICES.length);
    DataMemberships dataMemberships = new RootDataMemberships(rowWeights, treeData, indexManager);
    TreeTargetNominalColumnData targetData = twoClassTennisData.getSecond();
    TreeNominalColumnData columnData = twoClassTennisData.getFirst();
    ClassificationPriors priors = targetData.getDistribution(rowWeights, config);
    RandomData rd = TestDataGenerator.createRandomData();
    SplitCandidate splitCandidate = columnData.calcBestSplitClassification(dataMemberships, priors, targetData, rd);
    assertNotNull(splitCandidate);
    assertThat(splitCandidate, instanceOf(NominalBinarySplitCandidate.class));
    NominalBinarySplitCandidate binarySplitCandidate = (NominalBinarySplitCandidate) splitCandidate;
    TreeNodeNominalBinaryCondition[] childConditions = binarySplitCandidate.getChildConditions();
    // a binary split has two children that share the same value set but use opposite set logic
    assertEquals(2, childConditions.length);
    assertArrayEquals(new String[] { "R" }, childConditions[0].getValues());
    assertArrayEquals(new String[] { "R" }, childConditions[1].getValues());
    assertEquals(SetLogic.IS_NOT_IN, childConditions[0].getSetLogic());
    assertEquals(SetLogic.IS_IN, childConditions[1].getSetLogic());
    // check if missing values go left
    assertTrue(childConditions[0].acceptsMissings());
    assertFalse(childConditions[1].acceptsMissings());
    // check correct behavior if missing values are encountered during split search
    String dataContainingMissingsCSV = "S,?,O,R,S,R,S,O,O,?";
    columnData = dataGen.createNominalAttributeColumn(dataContainingMissingsCSV, "column containing missing values", 0);
    // rebuild tree data, index manager and memberships for the new attribute column;
    // rowWeights, priors and targetData from the first part are reused
    treeData = dataGen.createTreeData(targetData, columnData);
    indexManager = new DefaultDataIndexManager(treeData);
    dataMemberships = new RootDataMemberships(rowWeights, treeData, indexManager);
    // unlike the first call, no RandomData is passed here (null)
    splitCandidate = columnData.calcBestSplitClassification(dataMemberships, priors, targetData, null);
    assertNotNull(splitCandidate);
    binarySplitCandidate = (NominalBinarySplitCandidate) splitCandidate;
    assertEquals("Gain was not as expected", 0.08, binarySplitCandidate.getGainValue(), 1e-8);
    childConditions = binarySplitCandidate.getChildConditions();
    // the missing value marker "?" is expected to be part of the condition's value set
    String[] conditionValues = new String[] { "O", "?" };
    assertArrayEquals("Values in nominal condition did not match", conditionValues, childConditions[0].getValues());
    assertArrayEquals("Values in nominal condition did not match", conditionValues, childConditions[1].getValues());
    assertEquals("Wrong set logic.", SetLogic.IS_NOT_IN, childConditions[0].getSetLogic());
    assertEquals("Wrong set logic.", SetLogic.IS_IN, childConditions[1].getSetLogic());
    // this time the missing values are expected in the second (IS_IN) child
    assertFalse("Missig values are not sent to the correct child.", childConditions[0].acceptsMissings());
    assertTrue("Missig values are not sent to the correct child.", childConditions[1].acceptsMissings());
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) RandomData(org.apache.commons.math.random.RandomData) IDataIndexManager(org.knime.base.node.mine.treeensemble2.data.memberships.IDataIndexManager) NominalMultiwaySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalMultiwaySplitCandidate) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate) SplitCandidate(org.knime.base.node.mine.treeensemble2.learner.SplitCandidate) DefaultDataIndexManager(org.knime.base.node.mine.treeensemble2.data.memberships.DefaultDataIndexManager) DataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships) RootDataMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships) TreeNodeNominalBinaryCondition(org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalBinaryCondition) NominalBinarySplitCandidate(org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate)

Example 9 with TreeTargetNominalColumnData

use of org.knime.base.node.mine.treeensemble2.data.TreeTargetNominalColumnData in project knime-core by knime.

the class TreeBitVectorColumnData method calcBestSplitClassification.

/**
 * {@inheritDoc}
 * <p>
 * This implementation accumulates the row weights per target value separately for rows whose bit is set
 * and rows whose bit is not set, and derives the gain of splitting on the bit from these two
 * distributions. Returns {@code null} if either side weighs less than the configured minimum child size.
 */
@Override
public SplitCandidate calcBestSplitClassification(final DataMemberships dataMemberships, final ClassificationPriors targetPriors, final TreeTargetNominalColumnData targetColumn, final RandomData rd) {
    final NominalValueRepresentation[] targetValues = targetColumn.getMetaData().getValues();
    final IImpurity impurity = targetPriors.getImpurityCriterion();
    final int minChildSize = getConfiguration().getMinChildSize();

    // per-target-value weight sums for set ('1') and unset ('0') bits
    final double[] setBitTargetWeights = new double[targetValues.length];
    final double[] unsetBitTargetWeights = new double[targetValues.length];
    double setBitWeight = 0.0;
    double unsetBitWeight = 0.0;

    final ColumnMemberships memberships = dataMemberships.getColumnMemberships(getMetaData().getAttributeIndex());
    while (memberships.next()) {
        final double rowWeight = memberships.getRowWeight();
        if (rowWeight < EPSILON) {
            // such a record is not in the current branch or not in the sample and is not expected here
            assert false : "This code should never be reached!";
            continue;
        }
        final int targetIndex = targetColumn.getValueFor(memberships.getOriginalIndex());
        final boolean bitIsSet = m_columnBitSet.get(memberships.getIndexInColumn());
        if (bitIsSet) {
            setBitWeight += rowWeight;
            setBitTargetWeights[targetIndex] += rowWeight;
        } else {
            unsetBitWeight += rowWeight;
            unsetBitTargetWeights[targetIndex] += rowWeight;
        }
    }

    if (setBitWeight < minChildSize || unsetBitWeight < minChildSize) {
        return null;
    }

    final double totalWeight = setBitWeight + unsetBitWeight;
    final double setBitImpurity = impurity.getPartitionImpurity(setBitTargetWeights, setBitWeight);
    final double unsetBitImpurity = impurity.getPartitionImpurity(unsetBitTargetWeights, unsetBitWeight);
    final double[] partitionImpurities = { setBitImpurity, unsetBitImpurity };
    final double[] partitionWeights = { setBitWeight, unsetBitWeight };
    final double postSplitImpurity = impurity.getPostSplitImpurity(partitionImpurities, partitionWeights, totalWeight);
    final double gain = impurity.getGain(targetPriors.getPriorImpurity(), postSplitImpurity, partitionWeights, totalWeight);
    return new BitSplitCandidate(this, gain);
}
Also used : BitSplitCandidate(org.knime.base.node.mine.treeensemble2.learner.BitSplitCandidate) ColumnMemberships(org.knime.base.node.mine.treeensemble2.data.memberships.ColumnMemberships) IImpurity(org.knime.base.node.mine.treeensemble2.learner.IImpurity)

Example 10 with TreeTargetNominalColumnData

use of org.knime.base.node.mine.treeensemble2.data.TreeTargetNominalColumnData in project knime-core by knime.

the class TreeLearnerClassification method findBestSplitClassification.

/**
 * Determines the split candidate with the highest gain among the sampled columns.
 * <p>
 * No split is searched (and {@code null} is returned) if the maximum number of levels is reached, the
 * node holds fewer records than the configured minimum node size, or the prior impurity is below
 * {@link TreeColumnData#EPSILON}. If a hard coded root column is configured, its best split is returned
 * directly at depth 0. Otherwise all non-forbidden columns of the sample are evaluated; a candidate whose
 * gain equals the best gain so far replaces it depending on a random draw.
 *
 * @param currentDepth depth of the node that is to be split
 * @param dataMemberships the rows belonging to the node
 * @param columnSample the columns to consider for the split
 * @param treeNodeSignature signature of the node (not used by this method)
 * @param targetPriors target distribution of the node
 * @param forbiddenColumnSet attribute indices of columns that must not be used
 * @return the best split candidate or {@code null} if no split was found
 */
private SplitCandidate findBestSplitClassification(final int currentDepth, final DataMemberships dataMemberships, final ColumnSample columnSample, final TreeNodeSignature treeNodeSignature, final ClassificationPriors targetPriors, final BitSet forbiddenColumnSet) {
    final TreeData treeData = getData();
    final RandomData random = getRandomData();
    final TreeEnsembleLearnerConfiguration learnerConfig = getConfig();

    final int depthLimit = learnerConfig.getMaxLevels();
    final boolean depthLimitReached = depthLimit != TreeEnsembleLearnerConfiguration.MAX_LEVEL_INFINITE && currentDepth >= depthLimit;
    if (depthLimitReached) {
        return null;
    }

    final int minNodeSize = learnerConfig.getMinNodeSize();
    if (minNodeSize != TreeEnsembleLearnerConfiguration.MIN_NODE_SIZE_UNDEFINED && targetPriors.getNrRecords() < minNodeSize) {
        return null;
    }

    if (targetPriors.getPriorImpurity() < TreeColumnData.EPSILON) {
        return null;
    }

    final TreeTargetNominalColumnData targetColumn = (TreeTargetNominalColumnData) treeData.getTargetColumn();
    if (currentDepth == 0 && learnerConfig.getHardCodedRootColumn() != null) {
        final TreeAttributeColumnData rootColumn = treeData.getColumn(learnerConfig.getHardCodedRootColumn());
        // TODO discuss whether this option makes sense with surrogates
        return rootColumn.calcBestSplitClassification(dataMemberships, targetPriors, targetColumn, random);
    }

    SplitCandidate bestSplit = null;
    double bestGain = 0.0;
    for (final TreeAttributeColumnData column : columnSample) {
        final boolean forbidden = forbiddenColumnSet.get(column.getMetaData().getAttributeIndex());
        if (!forbidden) {
            final SplitCandidate columnSplit = column.calcBestSplitClassification(dataMemberships, targetPriors, targetColumn, random);
            if (columnSplit == null) {
                continue;
            }
            final double columnGain = columnSplit.getGainValue();
            // the random draw only happens on an exact tie with the best gain so far
            if (columnGain > bestGain || (columnGain == bestGain && random.nextInt(0, 1) == 0)) {
                bestSplit = columnSplit;
                bestGain = columnGain;
            }
        }
    }
    return bestSplit;
}
Also used : TreeEnsembleLearnerConfiguration(org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration) RandomData(org.apache.commons.math.random.RandomData) TreeAttributeColumnData(org.knime.base.node.mine.treeensemble2.data.TreeAttributeColumnData) TreeData(org.knime.base.node.mine.treeensemble2.data.TreeData) TreeTargetNominalColumnData(org.knime.base.node.mine.treeensemble2.data.TreeTargetNominalColumnData)

Aggregations

TreeEnsembleLearnerConfiguration (org.knime.base.node.mine.treeensemble2.node.learner.TreeEnsembleLearnerConfiguration)23 DataMemberships (org.knime.base.node.mine.treeensemble2.data.memberships.DataMemberships)16 RootDataMemberships (org.knime.base.node.mine.treeensemble2.data.memberships.RootDataMemberships)16 SplitCandidate (org.knime.base.node.mine.treeensemble2.learner.SplitCandidate)14 RandomData (org.apache.commons.math.random.RandomData)13 Test (org.junit.Test)13 NominalBinarySplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NominalBinarySplitCandidate)12 DefaultDataIndexManager (org.knime.base.node.mine.treeensemble2.data.memberships.DefaultDataIndexManager)11 IDataIndexManager (org.knime.base.node.mine.treeensemble2.data.memberships.IDataIndexManager)10 NominalMultiwaySplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NominalMultiwaySplitCandidate)9 BitSet (java.util.BitSet)8 TreeTargetNominalColumnData (org.knime.base.node.mine.treeensemble2.data.TreeTargetNominalColumnData)7 TreeNodeNominalBinaryCondition (org.knime.base.node.mine.treeensemble2.model.TreeNodeNominalBinaryCondition)7 TreeData (org.knime.base.node.mine.treeensemble2.data.TreeData)6 NumericSplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NumericSplitCandidate)6 NumericMissingSplitCandidate (org.knime.base.node.mine.treeensemble2.learner.NumericMissingSplitCandidate)5 TreeNodeNumericCondition (org.knime.base.node.mine.treeensemble2.model.TreeNodeNumericCondition)5 BigInteger (java.math.BigInteger)4 ArrayList (java.util.ArrayList)4 TreeAttributeColumnData (org.knime.base.node.mine.treeensemble2.data.TreeAttributeColumnData)4