Search in sources :

Example 21 with DataColumnDomain

use of org.knime.core.data.DataColumnDomain in project knime-core by knime.

the class RPropNodeModel method configure.

/**
 * Validates the incoming specs against the configured class column and
 * creates the PMML output spec.
 * <p>
 * Every column other than the class column has to be compatible with
 * {@link DoubleValue} and is collected as a learning column; a warning is set
 * when its domain bounds leave the interval [0,1]. The class column is
 * collected as the target column; if it is double-compatible and its domain
 * bounds leave [0,1], the configuration is rejected.
 *
 * {@inheritDoc}
 */
@Override
protected PortObjectSpec[] configure(final PortObjectSpec[] inSpecs) throws InvalidSettingsException {
    final String classColumnName = m_classcol.getStringValue();
    // Without a configured class column nothing can be validated.
    if (classColumnName == null) {
        throw new InvalidSettingsException("Class column not set");
    }

    final List<String> learningCols = new LinkedList<String>();
    final List<String> targetCols = new LinkedList<String>();
    boolean targetFound = false;

    for (DataColumnSpec column : (DataTableSpec) inSpecs[INDATA]) {
        final String columnName = column.getName();
        final boolean isNumeric = column.getType().isCompatible(DoubleValue.class);

        if (columnName.equals(classColumnName)) {
            // The class column becomes the (only) target column.
            targetCols.add(columnName);
            targetFound = true;
            // TODO: Check what happens to other values than double
            if (isNumeric) {
                // Regression targets have to lie in [0,1].
                final DataColumnDomain targetDomain = column.getDomain();
                if (targetDomain.hasBounds()) {
                    final double min = ((DoubleValue) targetDomain.getLowerBound()).getDoubleValue();
                    final double max = ((DoubleValue) targetDomain.getUpperBound()).getDoubleValue();
                    if (min < 0 || max > 1) {
                        throw new InvalidSettingsException("Domain range for regression in column " + columnName + " not in range [0,1]");
                    }
                }
            }
            continue;
        }

        // Any other column is an input column and must be numeric.
        if (!isNumeric) {
            throw new InvalidSettingsException("Only double columns for input");
        }
        learningCols.add(columnName);
        final DataColumnDomain inputDomain = column.getDomain();
        if (inputDomain.hasBounds()) {
            final double min = ((DoubleValue) inputDomain.getLowerBound()).getDoubleValue();
            final double max = ((DoubleValue) inputDomain.getUpperBound()).getDoubleValue();
            // Un-normalized input is tolerated, the user is only warned.
            if (min < 0 || max > 1) {
                setWarningMessage("Input data not normalized." + " Please consider using the " + "Normalizer Node first.");
            }
        }
    }

    if (!targetFound) {
        throw new InvalidSettingsException("Class column " + classColumnName + " not found in DataTableSpec");
    }

    // The optional PMML input is only forwarded when the PMML in-port is enabled.
    final PMMLPortObjectSpec pmmlInSpec = m_pmmlInEnabled ? (PMMLPortObjectSpec) inSpecs[1] : null;
    final DataTableSpec tableSpec = (DataTableSpec) inSpecs[0];
    return new PortObjectSpec[] { createPMMLPortObjectSpec(pmmlInSpec, tableSpec, learningCols, targetCols) };
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) PMMLPortObjectSpec(org.knime.core.node.port.pmml.PMMLPortObjectSpec) DataColumnSpec(org.knime.core.data.DataColumnSpec) DataColumnDomain(org.knime.core.data.DataColumnDomain) DoubleValue(org.knime.core.data.DoubleValue) InvalidSettingsException(org.knime.core.node.InvalidSettingsException) PMMLPortObjectSpec(org.knime.core.node.port.pmml.PMMLPortObjectSpec) PortObjectSpec(org.knime.core.node.port.PortObjectSpec) SettingsModelString(org.knime.core.node.defaultnodesettings.SettingsModelString) LinkedList(java.util.LinkedList)

Example 22 with DataColumnDomain

use of org.knime.core.data.DataColumnDomain in project knime-core by knime.

the class EditNominalDomainNodeModel method sortPossibleValues.

/**
 * Creates a copy of the given table spec in which the possible values of all
 * configured string columns are replaced by the configured ordering.
 * <p>
 * Columns that are not configured are copied unchanged. A configured column
 * that is not of type {@link StringCell#TYPE} is copied unchanged if wrong
 * types are ignored by the configuration, otherwise the method fails. For
 * configured string columns the existing domain bounds are kept and the
 * possible values are rebuilt from the configured sorting and the values of
 * the current domain (via {@code diff} and {@code resolveNewValues}).
 * Configured columns that no longer exist in the spec cause a failure unless
 * the configuration ignores them, in which case a warning is set.
 *
 * @param orgSpec the original table spec
 * @return the table spec with the adapted domains
 * @throws InvalidSettingsException if no configuration is available, a
 *             configured column has the wrong type, or a configured column is
 *             missing, and the configuration does not allow ignoring this
 */
private DataTableSpec sortPossibleValues(final DataTableSpec orgSpec) throws InvalidSettingsException {
    if (m_configuration == null) {
        throw new InvalidSettingsException("Missing Configuration.");
    }
    // Names are removed as they are matched; whatever is left afterwards is
    // configured but no longer part of the input spec.
    Set<String> configuredColumns = new HashSet<String>(m_configuration.getConfiguredColumns());
    String[] columnNames = orgSpec.getColumnNames();
    DataTableSpecCreator creator = new DataTableSpecCreator(orgSpec).dropAllColumns();
    for (int i = 0; i < orgSpec.getNumColumns(); i++) {
        String name = columnNames[i];
        if (configuredColumns.remove(name)) {
            DataColumnSpec orgDataSpec = orgSpec.getColumnSpec(i);
            if (!StringCell.TYPE.equals(orgDataSpec.getType())) {
                CheckUtils.checkSetting(m_configuration.isIgnoreWrongTypes(), "Column '%s' must be of type '%s' \nbut was of type: '%s'", name, StringCell.TYPE, orgDataSpec.getType());
                creator.addColumns(orgDataSpec);
            } else {
                DataColumnDomain domain = orgDataSpec.getDomain();
                DataColumnSpecCreator dataColumnSpecCreator = new DataColumnSpecCreator(orgDataSpec);
                // Keep the existing bounds, only the possible values change.
                DataColumnDomainCreator domainCreator = new DataColumnDomainCreator(domain.getLowerBound(), domain.getUpperBound());
                List<DataCell> sorting = new ArrayList<DataCell>(m_configuration.getSorting(name));
                Set<DataCell> difference = diff(domain.getValues(), sorting);
                domainCreator.setValues(resolveNewValues(sorting, difference));
                dataColumnSpecCreator.setDomain(domainCreator.createDomain());
                creator.addColumns(dataColumnSpecCreator.createSpec());
            }
        } else {
            creator.addColumns(orgSpec.getColumnSpec(i));
        }
    }
    if (!configuredColumns.isEmpty()) {
        String missingColumnsString = "Following columns are configured but no longer exist: \n" + ConvenienceMethods.getShortStringFrom(configuredColumns, 5);
        // The message embeds user-defined column names, so it must not be used
        // as the format template itself: a '%' in a column name would be
        // interpreted as a format specifier. Pass it as an argument instead.
        CheckUtils.checkSetting(m_configuration.isIgnoreNotExistingColumns(), "%s", missingColumnsString);
        setWarningMessage(missingColumnsString);
    }
    return creator.createSpec();
}
Also used : DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) DataTableSpecCreator(org.knime.core.data.DataTableSpecCreator) ArrayList(java.util.ArrayList) DataColumnDomainCreator(org.knime.core.data.DataColumnDomainCreator) DataColumnSpec(org.knime.core.data.DataColumnSpec) DataColumnDomain(org.knime.core.data.DataColumnDomain) InvalidSettingsException(org.knime.core.node.InvalidSettingsException) DataCell(org.knime.core.data.DataCell) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 23 with DataColumnDomain

use of org.knime.core.data.DataColumnDomain in project knime-core by knime.

the class CAIMDiscretizationNodeModel method execute.

/**
 * Discretizes every included column and returns the converted table together
 * with the resulting discretization model.
 * <p>
 * If no columns are included, the input table is passed through unchanged
 * along with an empty {@code DiscretizationModel}. Otherwise, for each
 * included column the candidate interval boundaries are computed and the
 * boundary yielding the highest CAIM value is inserted repeatedly until the
 * loop condition below is no longer met. The resulting schemes form the model
 * that is then used to create the output table.
 *
 * {@inheritDoc}
 */
@Override
protected PortObject[] execute(final PortObject[] inData, final ExecutionContext exec) throws Exception {
    // measure the time
    long startTime = System.currentTimeMillis();
    // empty model: nothing is included, so pass the input through unchanged
    if (m_includedColumnNames.getIncludeList() == null || m_includedColumnNames.getIncludeList().size() == 0) {
        return new PortObject[] { inData[0], new DiscretizationModel() };
    }
    LOGGER.debug("Start discretizing.");
    // as the algorithm is for binary class problems only
    // (positive, negative) the algorithm is performed for each class value
    // labeled as positive class and the rest as negative
    exec.setProgress(0.0, "Preparing...");
    // check input data
    BufferedDataTable data = (BufferedDataTable) inData[0];
    // get class column index
    m_classifyColumnIndex = data.getDataTableSpec().findColumnIndex(m_classColumnName.getStringValue());
    // NOTE(review): only verified when assertions are enabled
    assert m_classifyColumnIndex > -1;
    // create the class - index mapping
    createClassFromToIndexMaps(data.getDataTableSpec());
    // create the array with the result discretization schemes for
    // each included column
    DiscretizationScheme[] resultSchemes = new DiscretizationScheme[m_includedColumnNames.getIncludeList().size()];
    // for all included columns do the discretization
    int currentColumn = 0;
    for (String includedColumnName : m_includedColumnNames.getIncludeList()) {
        LOGGER.debug("Process column: " + includedColumnName);
        exec.setProgress("Discretizing column '" + includedColumnName + "'");
        // each column gets an equal share of the overall progress
        ExecutionContext subExecPerColumn = exec.createSubExecutionContext(1.0D / m_includedColumnNames.getIncludeList().size());
        subExecPerColumn.checkCanceled();
        // never discretize the class column (should never happen)
        // NOTE(review): skipping here does not advance currentColumn, so the
        // trailing slot(s) of resultSchemes would stay null in that case
        if (m_classColumnName.getStringValue().equals(includedColumnName)) {
            continue;
        }
        // determine the column index of the current column
        int columnIndex = data.getDataTableSpec().findColumnIndex(includedColumnName);
        DataColumnDomain domain = data.getDataTableSpec().getColumnSpec(columnIndex).getDomain();
        // NOTE(review): assumes the domain has DoubleValue bounds set; there is
        // no hasBounds()/null check before the casts - confirm against configure
        double minValue = ((DoubleValue) domain.getLowerBound()).getDoubleValue();
        double maxValue = ((DoubleValue) domain.getUpperBound()).getDoubleValue();
        // find all distinct values of the column and create
        // a table with all possible interval boundaries (midpoint value of
        // adjacent values)
        subExecPerColumn.setProgress("Find possible boundaries.");
        BoundaryScheme boundaryScheme = null;
        // create subExec for sorting (10% of the per-column progress)
        ExecutionContext subExecSort = subExecPerColumn.createSubExecutionContext(0.1);
        // long t1 = System.currentTimeMillis();
        // the flag selects between the two boundary creation implementations
        if (m_classOptimizedVersion) {
            boundaryScheme = createAllIntervalBoundaries(data, columnIndex, subExecSort);
        } else {
            boundaryScheme = createAllIntervalBoundaries2(data, columnIndex, subExecSort);
        }
        subExecSort.setProgress(1.0D);
        // long t2 = System.currentTimeMillis() - t1;
        // LOGGER.error("Create boundaries time: " + (t2 / 1000.0)
        // + " optimized: " + m_classOptimizedVersion);
        // LOGGER.error("Boundaries: " + boundaryScheme.getHead());
        LinkedDouble allIntervalBoundaries = boundaryScheme.getHead();
        // create the initial discretization scheme: a single interval
        // spanning the domain bounds of the column
        DiscretizationScheme discretizationScheme = new DiscretizationScheme(new Interval(minValue, maxValue, true, true));
        // CAIM value of the currently accepted scheme
        double globalCAIM = 0;
        // perform the iterative search for the best intervals
        int numInsertedBounds = 0;
        // best CAIM value found in the current iteration
        double currentCAIM = 0;
        // create subExec for inserted bounds (remaining 90% of the per-column
        // progress)
        ExecutionContext subExecBounds = subExecPerColumn.createSubExecutionContext(0.9);
        // continue while the last iteration improved the CAIM value or fewer
        // than (number of class values - 1) bounds have been inserted
        while (currentCAIM > globalCAIM || numInsertedBounds < m_classValues.length - 1) {
            subExecPerColumn.checkCanceled();
            // create subExec for counting
            ExecutionContext subExecCount = subExecBounds.createSubExecutionContext(1.0D / m_classValues.length);
            // LOGGER.debug("Inserted bounds: " + numInsertedBounds);
            // LOGGER.debug("interval boundaries: " +
            // allIntervalBoundaries);
            // for all possible interval boundaries
            // insert each one, calculate the caim value and add
            // the one with the biggest caim
            // (iteration starts at the element following the list head)
            LinkedDouble intervalBoundary = allIntervalBoundaries.m_next;
            currentCAIM = 0;
            LinkedDouble bestBoundary = null;
            long currentCountedBoundaries = 0;
            while (intervalBoundary != null) {
                subExecPerColumn.checkCanceled();
                // set progress
                currentCountedBoundaries++;
                subExecCount.setProgress((double) currentCountedBoundaries / (double) boundaryScheme.getNumBoundaries(), "Count for possible boundary " + currentCountedBoundaries + " of " + boundaryScheme.getNumBoundaries());
                // LOGGER.debug("current caim: " + currentCAIM);
                // try the candidate on a copy so the accepted scheme stays
                // untouched
                DiscretizationScheme tentativeDS = new DiscretizationScheme(discretizationScheme);
                tentativeDS.insertBound(intervalBoundary.m_value);
                // create the quanta matrix
                QuantaMatrix2D quantaMatrix = new QuantaMatrix2D(tentativeDS, m_classValueToIndexMap);
                // pass the data for filling the matrix
                quantaMatrix.countData(data, columnIndex, m_classifyColumnIndex);
                // calculate the caim
                double caim = quantaMatrix.calculateCaim();
                // remember the candidate with the highest CAIM seen so far
                if (caim > currentCAIM) {
                    currentCAIM = caim;
                    bestBoundary = intervalBoundary;
                }
                intervalBoundary = intervalBoundary.m_next;
            }
            // if there is no best boundary, break the first while loop
            if (bestBoundary == null) {
                break;
            }
            // in this case accept the best discretization scheme
            // NOTE(review): this bound (< length) differs from the loop
            // condition above (< length - 1) - confirm this is intended
            if (currentCAIM > globalCAIM || numInsertedBounds < m_classValues.length) {
                int numIntervals = discretizationScheme.getNumIntervals();
                discretizationScheme.insertBound(bestBoundary.m_value);
                // remove the linked list element from the list
                bestBoundary.remove();
                globalCAIM = currentCAIM;
                // an accepted bound must have increased the interval count
                if (numIntervals < discretizationScheme.getNumIntervals()) {
                    numInsertedBounds++;
                    subExecPerColumn.setProgress("Inserted bound " + numInsertedBounds);
                // LOGGER.debug("Inserted boundary: "
                // + bestBoundary.m_value);
                } else {
                    throw new IllegalStateException("Only usefull bounds should be inserted: " + bestBoundary.m_value);
                }
            }
            subExecCount.setProgress(1.0D);
        }
        resultSchemes[currentColumn] = discretizationScheme;
        subExecBounds.setProgress(1.0D);
        // ensure the full progress is set for this iteration
        subExecPerColumn.setProgress(1.0D);
        currentColumn++;
    }
    // set the model
    DataTableSpec modelSpec = createModelSpec(m_includedColumnNames, data.getDataTableSpec());
    m_discretizationModel = new DiscretizationModel(resultSchemes, modelSpec);
    // create an output table that replaces the included columns by
    // interval values
    BufferedDataTable resultTable = createResultTable(exec, data, m_discretizationModel);
    // log the runtime of the execute method
    long runtime = System.currentTimeMillis() - startTime;
    LOGGER.debug("Binning runtime: " + (runtime / 1000.0) + " sec.");
    return new PortObject[] { resultTable, m_discretizationModel };
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) DiscretizationScheme(org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme) SettingsModelFilterString(org.knime.core.node.defaultnodesettings.SettingsModelFilterString) SettingsModelString(org.knime.core.node.defaultnodesettings.SettingsModelString) ExecutionContext(org.knime.core.node.ExecutionContext) DataColumnDomain(org.knime.core.data.DataColumnDomain) DoubleValue(org.knime.core.data.DoubleValue) DiscretizationModel(org.knime.base.node.preproc.discretization.caim2.DiscretizationModel) BufferedDataTable(org.knime.core.node.BufferedDataTable) PortObject(org.knime.core.node.port.PortObject) Interval(org.knime.base.node.preproc.discretization.caim2.Interval)

Example 24 with DataColumnDomain

use of org.knime.core.data.DataColumnDomain in project knime-core by knime.

the class NominalTable method computeValues.

/**
 * Finds all possible values based on a table and a number of given column
 * indices by iterating through the table.
 * <p>
 * For each selected column the set of encountered cells becomes the set of
 * possible values of the new domain. Existing domain bounds are kept; if the
 * column has no bounds and is compatible with {@link DoubleValue}, the bounds
 * are derived from the smallest and largest encountered cell. If no cell was
 * encountered (empty table), the bounds are left unset.
 *
 * @param table the table to get values from
 * @param exec an object to check if user canceled and to report progress to;
 *            may be <code>null</code>, in which case neither is done
 * @param columnIndex an array of sorted (strictly ascending) column indices
 * @return a modified table spec containing all possible values
 * @throws NullPointerException if the table is <code>null</code>
 * @throws IllegalArgumentException if column indices are not sorted, contain
 *             duplicates or contain <code>-1</code>
 * @throws IndexOutOfBoundsException if a column index is out of range
 * @throws CanceledExecutionException if user canceled operation
 */
public static final DataTableSpec computeValues(final BufferedDataTable table, final ExecutionMonitor exec, final int... columnIndex) throws CanceledExecutionException {
    DataTableSpec oldSpec = table.getDataTableSpec();
    // keep all possible values for each column (index)
    @SuppressWarnings("unchecked") Set<DataCell>[] set = new Set[columnIndex.length];
    HashSet<Integer> hash = new HashSet<Integer>();
    for (int c = 0; c < columnIndex.length; c++) {
        if (columnIndex[c] == -1) {
            throw new IllegalArgumentException("Column " + columnIndex[c] + " not found.");
        }
        if (hash.contains(columnIndex[c])) {
            throw new IllegalArgumentException("Column indices " + " contain duplicates: " + c);
        }
        if (c > 0 && columnIndex[c - 1] >= columnIndex[c]) {
            throw new IllegalArgumentException("Column indices are " + "not sorted.");
        }
        hash.add(columnIndex[c]);
        set[c] = new HashSet<DataCell>();
    }
    // overall rows in the table
    long rowCount = 0;
    final double totalRows = table.size();
    for (DataRow row : table) {
        // get value for column indices
        for (int c = 0; c < columnIndex.length; c++) {
            DataCell cell = row.getCell(columnIndex[c]);
            // adds only each value once
            // NOTE(review): missing cells are collected like any other value -
            // confirm this is intended
            set[c].add(cell);
        }
        if (exec != null) {
            // throws exception if user canceled
            exec.checkCanceled();
            exec.setProgress(++rowCount / totalRows, "" + row.getKey());
        }
    }
    DataColumnSpec[] newColSpecs = new DataColumnSpec[oldSpec.getNumColumns()];
    // index within the set of possible values; valid because the column
    // indices are strictly ascending, so they are met in the same order here
    int idx = 0;
    for (int i = 0; i < newColSpecs.length; i++) {
        DataColumnSpec oldColSpec = oldSpec.getColumnSpec(i);
        if (hash.contains(i)) {
            DataColumnSpecCreator creator = new DataColumnSpecCreator(oldColSpec);
            DataCell lower = null;
            DataCell upper = null;
            if (oldColSpec.getDomain().hasBounds()) {
                lower = oldColSpec.getDomain().getLowerBound();
                upper = oldColSpec.getDomain().getUpperBound();
            } else if (oldColSpec.getType().isCompatible(DoubleValue.class) && !set[idx].isEmpty()) {
                // TODO DoubleValue is to restrict
                // Derive the bounds from the observed values. The emptiness
                // check above avoids a NoSuchElementException from
                // first()/last() when the table has no rows.
                TreeSet<DataCell> tSet = new TreeSet<DataCell>(oldColSpec.getType().getComparator());
                tSet.addAll(set[idx]);
                lower = tSet.first();
                upper = tSet.last();
            }
            DataColumnDomain dom = new DataColumnDomainCreator(set[idx], lower, upper).createDomain();
            creator.setDomain(dom);
            newColSpecs[i] = creator.createSpec();
            idx++;
        } else {
            newColSpecs[i] = oldColSpec;
        }
    }
    // create new table spec along with all column specs
    return new DataTableSpec(newColSpecs);
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) Set(java.util.Set) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) DataColumnDomainCreator(org.knime.core.data.DataColumnDomainCreator) DataRow(org.knime.core.data.DataRow) DataColumnSpec(org.knime.core.data.DataColumnSpec) DataColumnDomain(org.knime.core.data.DataColumnDomain) TreeSet(java.util.TreeSet) DataCell(org.knime.core.data.DataCell) HashSet(java.util.HashSet)

Example 25 with DataColumnDomain

use of org.knime.core.data.DataColumnDomain in project knime-core by knime.

the class MissingValueHandlingTable method createTableSpecPrivate.

/*
 * Private helper that assumes the ColSetting array to have the right format
 * (one setting per column of the spec). For every column whose missing values
 * are replaced by a fixed value, the column domain is widened so that it
 * covers that fixed value; all other columns keep their spec.
 */
private static DataTableSpec createTableSpecPrivate(final DataTableSpec spec, final ColSetting[] sets) {
    assert (spec.getNumColumns() == sets.length);
    final int columnCount = sets.length;
    final DataColumnSpec[] resultSpecs = new DataColumnSpec[columnCount];
    for (int col = 0; col < columnCount; col++) {
        final DataColumnSpec original = spec.getColumnSpec(col);
        // Default: the column spec is taken over unchanged.
        resultSpecs[col] = original;
        final ColSetting setting = sets[col];
        if (setting.getMethod() != ColSetting.METHOD_FIX_VAL) {
            continue;
        }

        final DataColumnDomain domain = original.getDomain();
        final Comparator<DataCell> comparator = original.getType().getComparator();
        final DataCell replacement = setting.getFixCell();

        // A bound is only adjusted if it is actually set, i.e. neither null
        // nor a missing cell, and the fixed value lies outside of it.
        DataCell lower = domain.getLowerBound();
        final boolean lowerExtended = lower != null && !lower.isMissing() && (comparator.compare(replacement, lower) < 0);
        if (lowerExtended) {
            lower = replacement;
        }

        DataCell upper = domain.getUpperBound();
        final boolean upperExtended = upper != null && !upper.isMissing() && (comparator.compare(replacement, upper) > 0);
        if (upperExtended) {
            upper = replacement;
        }

        // Possible values are only extended if the domain lists any; the
        // original set is copied instead of being modified.
        Set<DataCell> values = domain.getValues();
        final boolean valueAdded = values != null && !values.contains(replacement);
        if (valueAdded) {
            values = new LinkedHashSet<DataCell>(values);
            values.add(replacement);
        }

        if (lowerExtended || upperExtended || valueAdded) {
            final DataColumnSpecCreator specCreator = new DataColumnSpecCreator(original);
            specCreator.setDomain(new DataColumnDomainCreator(values, lower, upper).createDomain());
            resultSpecs[col] = specCreator.createSpec();
        }
    }
    return new DataTableSpec(resultSpecs);
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) DataColumnSpec(org.knime.core.data.DataColumnSpec) DataColumnDomain(org.knime.core.data.DataColumnDomain) DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) DataCell(org.knime.core.data.DataCell) DataColumnDomainCreator(org.knime.core.data.DataColumnDomainCreator)

Aggregations

DataColumnDomain (org.knime.core.data.DataColumnDomain)46 DataColumnSpec (org.knime.core.data.DataColumnSpec)34 DataCell (org.knime.core.data.DataCell)32 DataTableSpec (org.knime.core.data.DataTableSpec)20 DataColumnSpecCreator (org.knime.core.data.DataColumnSpecCreator)16 DoubleValue (org.knime.core.data.DoubleValue)13 DataColumnDomainCreator (org.knime.core.data.DataColumnDomainCreator)12 DataType (org.knime.core.data.DataType)11 InvalidSettingsException (org.knime.core.node.InvalidSettingsException)8 SettingsModelString (org.knime.core.node.defaultnodesettings.SettingsModelString)7 PMMLPortObjectSpec (org.knime.core.node.port.pmml.PMMLPortObjectSpec)6 ArrayList (java.util.ArrayList)5 HashSet (java.util.HashSet)5 LinkedHashSet (java.util.LinkedHashSet)5 DoubleCell (org.knime.core.data.def.DoubleCell)5 HashMap (java.util.HashMap)3 LinkedHashMap (java.util.LinkedHashMap)3 LinkedList (java.util.LinkedList)3 Set (java.util.Set)3 DataRow (org.knime.core.data.DataRow)3