Search in sources :

Example 1 with MissingCell

use of org.knime.core.data.MissingCell in project knime-core by knime.

the class PMMLRuleSetPredictorNodeModel method createRearranger.

/**
 * Constructs the {@link ColumnRearranger} for computing the new columns.
 *
 * @param obj The {@link PMMLPortObject} of the preprocessing model.
 * @param spec The {@link DataTableSpec} of the table.
 * @param replaceColumn Should replace the {@code outputColumnName}?
 * @param outputColumnName The output column name (which might be an existing).
 * @param addConfidence Should add the confidence values to a column?
 * @param confidenceColumnName The name of the confidence column.
 * @param validationColumnIdx Index of the validation column, {@code -1} if not specified.
 * @param processConcurrently Should be {@code false} when the statistics are to be computed.
 * @return The {@link ColumnRearranger} computing the result.
 * @throws InvalidSettingsException Problem with rules.
 */
private static ColumnRearranger createRearranger(final PMMLPortObject obj, final DataTableSpec spec, final boolean replaceColumn, final String outputColumnName, final boolean addConfidence, final String confidenceColumnName, final int validationColumnIdx, final boolean processConcurrently) throws InvalidSettingsException {
    List<Node> models = obj.getPMMLValue().getModels(PMMLModelType.RuleSetModel);
    if (models.size() != 1) {
        throw new InvalidSettingsException("Expected exactly on RuleSetModel, but got: " + models.size());
    }
    final PMMLRuleTranslator translator = new PMMLRuleTranslator();
    obj.initializeModelTranslator(translator);
    if (!translator.isScorable()) {
        throw new UnsupportedOperationException("The model is not scorable.");
    }
    final List<PMMLRuleTranslator.Rule> rules = translator.getRules();
    ColumnRearranger ret = new ColumnRearranger(spec);
    final List<DataColumnSpec> targetCols = obj.getSpec().getTargetCols();
    final DataType dataType = targetCols.isEmpty() ? StringCell.TYPE : targetCols.get(0).getType();
    DataColumnSpecCreator specCreator = new DataColumnSpecCreator(outputColumnName, dataType);
    Set<DataCell> outcomes = new LinkedHashSet<>();
    for (Rule rule : rules) {
        DataCell outcome;
        if (dataType.equals(BooleanCell.TYPE)) {
            outcome = BooleanCellFactory.create(rule.getOutcome());
        } else if (dataType.equals(StringCell.TYPE)) {
            outcome = new StringCell(rule.getOutcome());
        } else if (dataType.equals(DoubleCell.TYPE)) {
            try {
                outcome = new DoubleCell(Double.parseDouble(rule.getOutcome()));
            } catch (NumberFormatException e) {
                // ignore
                continue;
            }
        } else if (dataType.equals(IntCell.TYPE)) {
            try {
                outcome = new IntCell(Integer.parseInt(rule.getOutcome()));
            } catch (NumberFormatException e) {
                // ignore
                continue;
            }
        } else if (dataType.equals(LongCell.TYPE)) {
            try {
                outcome = new LongCell(Long.parseLong(rule.getOutcome()));
            } catch (NumberFormatException e) {
                // ignore
                continue;
            }
        } else {
            throw new UnsupportedOperationException("Unknown outcome type: " + dataType);
        }
        outcomes.add(outcome);
    }
    specCreator.setDomain(new DataColumnDomainCreator(outcomes).createDomain());
    DataColumnSpec colSpec = specCreator.createSpec();
    final RuleSelectionMethod ruleSelectionMethod = translator.getSelectionMethodList().get(0);
    final String defaultScore = translator.getDefaultScore();
    final Double defaultConfidence = translator.getDefaultConfidence();
    final DataColumnSpec[] specs;
    if (addConfidence) {
        specs = new DataColumnSpec[] { new DataColumnSpecCreator(DataTableSpec.getUniqueColumnName(ret.createSpec(), confidenceColumnName), DoubleCell.TYPE).createSpec(), colSpec };
    } else {
        specs = new DataColumnSpec[] { colSpec };
    }
    final int oldColumnIndex = replaceColumn ? ret.indexOf(outputColumnName) : -1;
    ret.append(new AbstractCellFactory(processConcurrently, specs) {

        private final List<String> m_values;

        {
            Map<String, List<String>> dd = translator.getDataDictionary();
            m_values = dd.get(targetCols.get(0).getName());
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public DataCell[] getCells(final DataRow row) {
            // See http://www.dmg.org/v4-1/RuleSet.html#Rule
            switch(ruleSelectionMethod.getCriterion().intValue()) {
                case RuleSelectionMethod.Criterion.INT_FIRST_HIT:
                    {
                        Pair<DataCell, Double> resultAndConfidence = selectFirstHit(row);
                        return toCells(resultAndConfidence);
                    }
                case RuleSelectionMethod.Criterion.INT_WEIGHTED_MAX:
                    {
                        Pair<DataCell, Double> resultAndConfidence = selectWeightedMax(row);
                        return toCells(resultAndConfidence);
                    }
                case RuleSelectionMethod.Criterion.INT_WEIGHTED_SUM:
                    {
                        Pair<DataCell, Double> resultAndConfidence = selectWeightedSum(row);
                        return toCells(resultAndConfidence);
                    }
                default:
                    throw new UnsupportedOperationException(ruleSelectionMethod.getCriterion().toString());
            }
        }

        /**
         * Converts the pair to a {@link DataCell} array.
         *
         * @param resultAndConfidence The {@link Pair}.
         * @return The result and possibly the confidence.
         */
        private DataCell[] toCells(final Pair<DataCell, Double> resultAndConfidence) {
            if (!addConfidence) {
                return new DataCell[] { resultAndConfidence.getFirst() };
            }
            if (resultAndConfidence.getSecond() == null) {
                return new DataCell[] { DataType.getMissingCell(), resultAndConfidence.getFirst() };
            }
            return new DataCell[] { new DoubleCell(resultAndConfidence.getSecond()), resultAndConfidence.getFirst() };
        }

        /**
         * Computes the result and the confidence using the weighted sum method.
         *
         * @param row A {@link DataRow}
         * @return The result and the confidence.
         */
        private Pair<DataCell, Double> selectWeightedSum(final DataRow row) {
            final Map<String, Double> scoreToSumWeight = new LinkedHashMap<String, Double>();
            for (String val : m_values) {
                scoreToSumWeight.put(val, 0.0);
            }
            int matchedRuleCount = 0;
            for (final PMMLRuleTranslator.Rule rule : rules) {
                if (rule.getCondition().evaluate(row, spec) == Boolean.TRUE) {
                    ++matchedRuleCount;
                    Double sumWeight = scoreToSumWeight.get(rule.getOutcome());
                    if (sumWeight == null) {
                        throw new IllegalStateException("The score value: " + rule.getOutcome() + " is not in the data dictionary.");
                    }
                    final Double wRaw = rule.getWeight();
                    final double w = wRaw == null ? 0.0 : wRaw.doubleValue();
                    scoreToSumWeight.put(rule.getOutcome(), sumWeight + w);
                }
            }
            double maxSumWeight = Double.NEGATIVE_INFINITY;
            String bestScore = null;
            for (Entry<String, Double> entry : scoreToSumWeight.entrySet()) {
                final double d = entry.getValue().doubleValue();
                if (d > maxSumWeight) {
                    maxSumWeight = d;
                    bestScore = entry.getKey();
                }
            }
            if (bestScore == null || matchedRuleCount == 0) {
                return pair(result(defaultScore), defaultConfidence);
            }
            return pair(result(bestScore), maxSumWeight / matchedRuleCount);
        }

        /**
         * Helper method to create {@link Pair}s.
         *
         * @param f The first element.
         * @param s The second element.
         * @return The new pair.
         */
        private <F, S> Pair<F, S> pair(final F f, final S s) {
            return new Pair<F, S>(f, s);
        }

        /**
         * Computes the result and the confidence using the weighted max method.
         *
         * @param row A {@link DataRow}
         * @return The result and the confidence.
         */
        private Pair<DataCell, Double> selectWeightedMax(final DataRow row) {
            double maxWeight = Double.NEGATIVE_INFINITY;
            PMMLRuleTranslator.Rule bestRule = null;
            for (final PMMLRuleTranslator.Rule rule : rules) {
                if (rule.getCondition().evaluate(row, spec) == Boolean.TRUE) {
                    if (rule.getWeight() > maxWeight) {
                        maxWeight = rule.getWeight();
                        bestRule = rule;
                    }
                }
            }
            if (bestRule == null) {
                return pair(result(defaultScore), defaultConfidence);
            }
            bestRule.setRecordCount(bestRule.getRecordCount() + 1);
            DataCell result = result(bestRule);
            if (validationColumnIdx >= 0) {
                if (row.getCell(validationColumnIdx).equals(result)) {
                    bestRule.setNbCorrect(bestRule.getNbCorrect() + 1);
                }
            }
            Double confidence = bestRule.getConfidence();
            return pair(result, confidence == null ? defaultConfidence : confidence);
        }

        /**
         * Selects the outcome of the rule and converts it to the proper outcome type.
         *
         * @param rule A {@link Rule}.
         * @return The {@link DataCell} representing the result. (May be missing.)
         */
        private DataCell result(final PMMLRuleTranslator.Rule rule) {
            String outcome = rule.getOutcome();
            return result(outcome);
        }

        /**
         * Constructs the {@link DataCell} from its {@link String} representation ({@code outcome}) and its type.
         *
         * @param dataType The expected {@link DataType}
         * @param outcome The {@link String} representation.
         * @return The {@link DataCell}.
         */
        private DataCell result(final String outcome) {
            if (outcome == null) {
                return DataType.getMissingCell();
            }
            try {
                if (dataType.isCompatible(BooleanValue.class)) {
                    return BooleanCellFactory.create(outcome);
                }
                if (IntCell.TYPE.isASuperTypeOf(dataType)) {
                    return new IntCell(Integer.parseInt(outcome));
                }
                if (LongCell.TYPE.isASuperTypeOf(dataType)) {
                    return new LongCell(Long.parseLong(outcome));
                }
                if (DoubleCell.TYPE.isASuperTypeOf(dataType)) {
                    return new DoubleCell(Double.parseDouble(outcome));
                }
                return new StringCell(outcome);
            } catch (NumberFormatException e) {
                return new MissingCell(outcome + "\n" + e.getMessage());
            }
        }

        /**
         * Selects the first rule that matches and computes the confidence and result for the {@code row}.
         *
         * @param row A {@link DataRow}.
         * @return The result and the confidence.
         */
        private Pair<DataCell, Double> selectFirstHit(final DataRow row) {
            for (final PMMLRuleTranslator.Rule rule : rules) {
                Boolean eval = rule.getCondition().evaluate(row, spec);
                if (eval == Boolean.TRUE) {
                    rule.setRecordCount(rule.getRecordCount() + 1);
                    DataCell result = result(rule);
                    if (validationColumnIdx >= 0) {
                        if (row.getCell(validationColumnIdx).equals(result)) {
                            rule.setNbCorrect(rule.getNbCorrect() + 1);
                        }
                    }
                    Double confidence = rule.getConfidence();
                    return pair(result, confidence == null ? defaultConfidence : confidence);
                }
            }
            return pair(result(defaultScore), defaultConfidence);
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public void afterProcessing() {
            super.afterProcessing();
            obj.getPMMLValue();
            RuleSetModel ruleSet = translator.getOriginalRuleSetModel();
            assert rules.size() == ruleSet.getRuleSet().getSimpleRuleList().size() + ruleSet.getRuleSet().getCompoundRuleList().size();
            if (ruleSet.getRuleSet().getSimpleRuleList().size() == rules.size()) {
                for (int i = 0; i < rules.size(); ++i) {
                    Rule rule = rules.get(i);
                    final SimpleRule simpleRuleArray = ruleSet.getRuleSet().getSimpleRuleArray(i);
                    synchronized (simpleRuleArray) /*synchronized fixes AP-6766 */
                    {
                        simpleRuleArray.setRecordCount(rule.getRecordCount());
                        if (validationColumnIdx >= 0) {
                            simpleRuleArray.setNbCorrect(rule.getNbCorrect());
                        } else if (simpleRuleArray.isSetNbCorrect()) {
                            simpleRuleArray.unsetNbCorrect();
                        }
                    }
                }
            }
        }
    });
    if (replaceColumn) {
        ret.remove(outputColumnName);
        ret.move(ret.getColumnCount() - 1 - (addConfidence ? 1 : 0), oldColumnIndex);
    }
    return ret;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) RuleSetModel(org.dmg.pmml.RuleSetModelDocument.RuleSetModel) DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) DoubleCell(org.knime.core.data.def.DoubleCell) Node(org.w3c.dom.Node) SettingsModelString(org.knime.core.node.defaultnodesettings.SettingsModelString) DataRow(org.knime.core.data.DataRow) IntCell(org.knime.core.data.def.IntCell) Entry(java.util.Map.Entry) SimpleRule(org.dmg.pmml.SimpleRuleDocument.SimpleRule) ColumnRearranger(org.knime.core.data.container.ColumnRearranger) DataColumnSpec(org.knime.core.data.DataColumnSpec) BooleanValue(org.knime.core.data.BooleanValue) DataType(org.knime.core.data.DataType) SettingsModelBoolean(org.knime.core.node.defaultnodesettings.SettingsModelBoolean) Pair(org.knime.core.util.Pair) AbstractCellFactory(org.knime.core.data.container.AbstractCellFactory) DataColumnDomainCreator(org.knime.core.data.DataColumnDomainCreator) RuleSelectionMethod(org.dmg.pmml.RuleSelectionMethodDocument.RuleSelectionMethod) Rule(org.knime.base.node.rules.engine.pmml.PMMLRuleTranslator.Rule) LongCell(org.knime.core.data.def.LongCell) InvalidSettingsException(org.knime.core.node.InvalidSettingsException) StringCell(org.knime.core.data.def.StringCell) MissingCell(org.knime.core.data.MissingCell) DataCell(org.knime.core.data.DataCell) SimpleRule(org.dmg.pmml.SimpleRuleDocument.SimpleRule) Rule(org.knime.base.node.rules.engine.pmml.PMMLRuleTranslator.Rule) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap)

Example 2 with MissingCell

use of org.knime.core.data.MissingCell in project knime-core by knime.

the class HistogramColumn method loadNominalHistogramsPrivate.

private static Map<Integer, HistogramNominalModel> loadNominalHistogramsPrivate(final File histogramsGz, final int[] nominalKeysSize) throws IOException, InvalidSettingsException {
    final FileInputStream is = new FileInputStream(histogramsGz);
    final GZIPInputStream inData = new GZIPInputStream(is);
    final ConfigRO config = NodeSettings.loadFromXML(inData);
    Map<Integer, HistogramNominalModel> histograms = new HashMap<Integer, HistogramNominalModel>();
    // .getConfig(HISTOGRAMS);
    ConfigRO hs = config;
    int[] nomColumnIndices = config.getIntArray(NOMINAL_COLUMNS);
    for (int colIdx : nomColumnIndices) {
        Config h = hs.getConfig(HISTOGRAM + colIdx);
        int maxCount = h.getInt(MAX_COUNT);
        int rowCount = h.getInt(ROW_COUNT);
        String colName = h.getString(COL_NAME);
        String[] values = h.getStringArray(BIN_VALUES);
        int[] binCounts = h.getIntArray(BIN_COUNTS);
        Map<DataValue, Integer> bins = new HashMap<DataValue, Integer>();
        for (int i = binCounts.length; i-- > 0; ) {
            if (values[i] == "?") {
                bins.put(new MissingCell(null), binCounts[i]);
            } else {
                bins.put(new StringCell(values[i]), binCounts[i]);
            }
        }
        HistogramNominalModel histogramData = new HistogramNominalModel(bins, colIdx, colName, rowCount);
        histogramData.setMaxCount(maxCount);
        histogramData.setRowCount(rowCount);
        // assert Math.abs(histogramData.m_width - width) < 1e-9: "histogram data width: " + histogramData.m_width + " width: " + width;
        assert nominalKeysSize[colIdx] == bins.size() : "Saved size of nominal bins: " + nominalKeysSize[colIdx] + ", restored from the file: " + bins.size();
        histograms.put(colIdx, histogramData);
    }
    return histograms;
}
Also used : HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) DataValue(org.knime.core.data.DataValue) Config(org.knime.core.node.config.Config) FileInputStream(java.io.FileInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) MissingCell(org.knime.core.data.MissingCell) StringCell(org.knime.core.data.def.StringCell) ConfigRO(org.knime.core.node.config.ConfigRO)

Example 3 with MissingCell

use of org.knime.core.data.MissingCell in project knime-core by knime.

the class TestDataGenerator method createNominalAttributeColumn.

public TreeNominalColumnData createNominalAttributeColumn(final String[] values, final String name, final int attributeIndex) {
    DataColumnSpec colSpec = new DataColumnSpecCreator(name, StringCell.TYPE).createSpec();
    TreeNominalColumnDataCreator colCreator = new TreeNominalColumnDataCreator(colSpec);
    for (int i = 0; i < values.length; i++) {
        RowKey rowKey = RowKey.createRowKey((long) i);
        if (values[i].equals("?")) {
            colCreator.add(rowKey, new MissingCell(null));
        } else {
            colCreator.add(rowKey, new StringCell(values[i]));
        }
    }
    TreeNominalColumnData col = colCreator.createColumnData(0, m_config);
    col.getMetaData().setAttributeIndex(attributeIndex);
    return col;
}
Also used : DataColumnSpec(org.knime.core.data.DataColumnSpec) DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) RowKey(org.knime.core.data.RowKey) MissingCell(org.knime.core.data.MissingCell) StringCell(org.knime.core.data.def.StringCell)

Example 4 with MissingCell

use of org.knime.core.data.MissingCell in project knime-core by knime.

the class TreeNumericColumnDataTest method createNumericColumnData.

public static TreeOrdinaryNumericColumnData createNumericColumnData(final TreeEnsembleLearnerConfiguration config, final double[] data, final String name, final int attributeIndex) {
    DataColumnSpec colSpec = new DataColumnSpecCreator(name, DoubleCell.TYPE).createSpec();
    TreeOrdinaryNumericColumnDataCreator colCreator = new TreeOrdinaryNumericColumnDataCreator(colSpec);
    for (int i = 0; i < data.length; i++) {
        final RowKey key = RowKey.createRowKey(i);
        if (Double.isNaN(data[i])) {
            colCreator.add(key, new MissingCell(null));
        } else {
            colCreator.add(key, new DoubleCell(data[i]));
        }
    }
    return colCreator.createColumnData(attributeIndex, config);
}
Also used : DataColumnSpec(org.knime.core.data.DataColumnSpec) DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) RowKey(org.knime.core.data.RowKey) MissingCell(org.knime.core.data.MissingCell) DoubleCell(org.knime.core.data.def.DoubleCell)

Example 5 with MissingCell

use of org.knime.core.data.MissingCell in project knime-core by knime.

the class DataCellToJavaConversionTest method testCollectionTypes.

/**
 * Test ListCell(IntCell) -> Integer[] conversion.
 *
 * @throws Exception When something went wrong
 */
@Test
public void testCollectionTypes() throws Exception {
    ArrayList<DataCell> coll = new ArrayList<>();
    for (int i = 0; i < 5; ++i) {
        coll.add(new IntCell(i * i));
    }
    // collection cells can always contain missing cells.
    coll.add(new MissingCell("42"));
    final ListCell listCell = CollectionCellFactory.createListCell(coll);
    final Optional<? extends DataCellToJavaConverterFactory<? extends DataValue, Integer[]>> factory = DataCellToJavaConverterRegistry.getInstance().getConverterFactories(listCell.getType(), Integer[].class).stream().findFirst();
    assertTrue(factory.isPresent());
    final DataCellToJavaConverter<DataCell, Integer[]> converter = (DataCellToJavaConverter<DataCell, Integer[]>) factory.get().create();
    assertNotNull(converter);
    final Integer[] array = converter.convert(listCell);
    for (int i = 0; i < 5; ++i) {
        assertEquals(new Integer(i * i), array[i]);
    }
    assertNull(array[5]);
}
Also used : DataCellToJavaConverter(org.knime.core.data.convert.java.DataCellToJavaConverter) MissingCell(org.knime.core.data.MissingCell) ListCell(org.knime.core.data.collection.ListCell) ArrayList(java.util.ArrayList) DataCell(org.knime.core.data.DataCell) BinaryObjectDataCell(org.knime.core.data.blob.BinaryObjectDataCell) IntCell(org.knime.core.data.def.IntCell) Test(org.junit.Test)

Aggregations

MissingCell (org.knime.core.data.MissingCell)18 DataCell (org.knime.core.data.DataCell)13 DataColumnSpec (org.knime.core.data.DataColumnSpec)6 DoubleCell (org.knime.core.data.def.DoubleCell)6 DataColumnSpecCreator (org.knime.core.data.DataColumnSpecCreator)5 DefaultRow (org.knime.core.data.def.DefaultRow)5 StringCell (org.knime.core.data.def.StringCell)5 ArrayList (java.util.ArrayList)4 DataRow (org.knime.core.data.DataRow)4 RowKey (org.knime.core.data.RowKey)4 IntCell (org.knime.core.data.def.IntCell)4 LinkedHashMap (java.util.LinkedHashMap)3 DataType (org.knime.core.data.DataType)3 DoubleValue (org.knime.core.data.DoubleValue)3 InvalidSettingsException (org.knime.core.node.InvalidSettingsException)3 IOException (java.io.IOException)2 Test (org.junit.Test)2 BinaryObjectDataCell (org.knime.core.data.blob.BinaryObjectDataCell)2 ListCell (org.knime.core.data.collection.ListCell)2 ColumnRearranger (org.knime.core.data.container.ColumnRearranger)2