Search in sources :

Example 6 with DomainCreatorColumnSelection

Usage of org.knime.core.data.DomainCreatorColumnSelection in the project knime-core by knime.

The example is taken from the class LinReg2Learner, method recalcDomainOfLearningFields.

/**
 * Recalculates the domain of the given table for the nominal learning and target columns and
 * initializes the learner with the resulting table spec.
 *
 * <p>Nominal learning fields whose recalculated domain carries no values are collected, handed
 * to {@code init}, and reported through a logged warning that is also appended to
 * {@code m_warningMessage}.
 *
 * @param data the input table whose domain is recalculated
 * @param inPMMLSpec the incoming PMML spec; only forwarded to {@code init}
 * @param exec used for the domain update and for creating the spec-replaced table
 * @return the input table wrapped with the recalculated spec
 * @throws InvalidSettingsException NOTE(review): presumably raised by {@code init} - confirm
 * @throws CanceledExecutionException NOTE(review): presumably raised by the domain update when
 *         execution is canceled - confirm
 */
private BufferedDataTable recalcDomainOfLearningFields(final BufferedDataTable data, final PMMLPortObjectSpec inPMMLSpec, final ExecutionContext exec) throws InvalidSettingsException, CanceledExecutionException {
    // First selection: dropDomain is true for every column, createDomain only for nominal
    // columns that are learning or target fields of the outgoing PMML spec.
    // NOTE(review): presumably this selection governs the possible values and the second one
    // the min/max bounds - confirm against DataTableDomainCreator.
    // NOTE(review): dropDomain returns true although the remark further down states that the
    // domain cannot be dropped - confirm the intent.
    final DomainCreatorColumnSelection nominalModelColumns = new DomainCreatorColumnSelection() {

        @Override
        public boolean dropDomain(final DataColumnSpec colSpec) {
            return true;
        }

        @Override
        public boolean createDomain(final DataColumnSpec colSpec) {
            if (!colSpec.getType().isCompatible(NominalValue.class)) {
                return false;
            }
            if (m_pmmlOutSpec.getLearningFields().contains(colSpec.getName())) {
                return true;
            }
            return m_pmmlOutSpec.getTargetFields().contains(colSpec.getName());
        }
    };
    // Second selection: neither drops nor creates anything for any column.
    final DomainCreatorColumnSelection leaveUntouched = new DomainCreatorColumnSelection() {

        @Override
        public boolean dropDomain(final DataColumnSpec colSpec) {
            return false;
        }

        @Override
        public boolean createDomain(final DataColumnSpec colSpec) {
            return false;
        }
    };

    final DataTableDomainCreator creator =
        new DataTableDomainCreator(data.getDataTableSpec(), nominalModelColumns, leaveUntouched);
    creator.updateDomain(data, exec);
    final DataTableSpec updatedSpec = creator.createSpec();
    final BufferedDataTable result = exec.createSpecReplacerTable(data, updatedSpec);

    // Bug fix 5793, similar to 5580 in LogReg2Learner: ignore columns with too many different
    // values. Because this would change behavior, the domain cannot be dropped, which means that
    // even prepending a domain calculator to this node will not help when the column has too
    // many values.
    final Set<String> skippedFields = new LinkedHashSet<>();
    for (final String fieldName : m_pmmlOutSpec.getLearningFields()) {
        final DataColumnSpec fieldSpec = updatedSpec.getColumnSpec(fieldName);
        final boolean nominal = fieldSpec.getType().isCompatible(NominalValue.class);
        if (nominal && !fieldSpec.getDomain().hasValues()) {
            skippedFields.add(fieldName);
        }
    }

    // Initialize m_learner so that it has the correct DataTableSpec of the input.
    init(result.getDataTableSpec(), inPMMLSpec, skippedFields);
    if (skippedFields.isEmpty()) {
        return result;
    }

    // No hint about inserting a domain calculator is added to the message; as remarked above,
    // doing so would not help for these columns.
    final boolean single = skippedFields.size() == 1;
    final String text = (single ? "Column " : "Columns ")
        + ConvenienceMethods.getShortStringFrom(skippedFields, 5)
        + (single ? " has " : " have ")
        + "too many different values - will be ignored during training";
    LOGGER.warn(text);
    // Append to an already existing warning on a new line, otherwise start a fresh one.
    m_warningMessage = m_warningMessage == null ? text : m_warningMessage + "\n" + text;
    return result;
}
Also used : DataTableDomainCreator(org.knime.core.data.DataTableDomainCreator) LinkedHashSet(java.util.LinkedHashSet) DataTableSpec(org.knime.core.data.DataTableSpec) DataColumnSpec(org.knime.core.data.DataColumnSpec) NominalValue(org.knime.core.data.NominalValue) DomainCreatorColumnSelection(org.knime.core.data.DomainCreatorColumnSelection) BufferedDataTable(org.knime.core.node.BufferedDataTable)

Aggregations

DataColumnSpec (org.knime.core.data.DataColumnSpec)6 DataTableDomainCreator (org.knime.core.data.DataTableDomainCreator)6 DomainCreatorColumnSelection (org.knime.core.data.DomainCreatorColumnSelection)6 LinkedHashSet (java.util.LinkedHashSet)4 DataTableSpec (org.knime.core.data.DataTableSpec)4 NominalValue (org.knime.core.data.NominalValue)4 BufferedDataTable (org.knime.core.node.BufferedDataTable)4 HashSet (java.util.HashSet)2