Search in sources :

Example 1 with DiscretizationScheme

use of org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme in project knime-core by knime.

The class BinModelPlotter, method updatePaintModel.

/**
 * {@inheritDoc}
 *
 * Rebuilds the bin rulers shown in the drawing pane from the current
 * discretization model. Does nothing if no model is set. If no column
 * selection exists yet, all columns included in the model are selected. For
 * every selected scheme one {@link BinRuler} is created whose bin positions
 * are the scheme bounds mapped onto the available ruler width.
 */
@Override
public synchronized void updatePaintModel() {
    if (m_discretizationModel == null) {
        return;
    }
    final BinModelDrawingPane drawingPane = (BinModelDrawingPane) getDrawingPane();
    // clear the drawing pane
    drawingPane.setBinningSchemes(null);
    // no selection yet: start with all columns that were binned by the model
    if (m_selectedColumns == null) {
        m_selectedColumns = new LinkedHashSet<String>();
        for (String binnedColumnName : m_discretizationModel.getIncludedColumnNames()) {
            m_selectedColumns.add(binnedColumnName);
        }
        ((MultiColumnPlotterProperties) getProperties()).updateColumnSelection(m_binnedColumnsSpec, m_selectedColumns);
    }
    if (m_selectedColumns.isEmpty()) {
        // nothing to draw, just show the cleared pane
        drawingPane.repaint();
        return;
    }
    // one coordinate per selected column that is present in the binned spec
    m_coordinates = new ArrayList<Coordinate>();
    for (String name : m_selectedColumns) {
        int idx = m_binnedColumnsSpec.findColumnIndex(name);
        if (idx >= 0) {
            m_coordinates.add(Coordinate.createCoordinate(m_binnedColumnsSpec.getColumnSpec(idx)));
        }
    }
    // get the binning schemes for the selected columns
    DiscretizationScheme[] selectedSchemes = getSelectedSchemes();
    String[] selectedColumnNames = getSelectedColumnNames();
    // calculate the display coordinates for the drawing pane
    BinRuler[] binRulers = new BinRuler[selectedSchemes.length];
    // determine the width available for a bin ruler
    int rulerWidth = getDrawingPaneDimension().width - 2 * m_hMargin;
    for (int i = 0; i < selectedSchemes.length; i++) {
        double[] bounds = selectedSchemes[i].getBounds();
        double min = bounds[0];
        double max = bounds[bounds.length - 1];
        // build a throw-away double column spec whose domain spans the
        // scheme's outer bounds; it is only used to map bounds to pixels
        DataColumnSpecCreator columnSpecCreator = new DataColumnSpecCreator("", DoubleCell.TYPE);
        columnSpecCreator.setDomain(new DataColumnDomainCreator(new DoubleCell(min), new DoubleCell(max)).createDomain());
        DoubleCoordinate coordinate = (DoubleCoordinate) Coordinate.createCoordinate(columnSpecCreator.createSpec());
        // rulers are stacked vertically, one display row per scheme
        Point leftStart = new Point(m_hMargin, m_vMargin + (i + 1) * m_columnDisplayHeight);
        int[] binPositions = new int[bounds.length];
        String[] binLabels = new String[bounds.length];
        for (int b = 0; b < bounds.length; b++) {
            binPositions[b] = (int) coordinate.calculateMappedValue(new DoubleCell(bounds[b]), rulerWidth, true);
            binLabels[b] = coordinate.formatNumber(bounds[b]);
        }
        binRulers[i] = new BinRuler(leftStart, rulerWidth, binPositions, binLabels, selectedColumnNames[i]);
    }
    drawingPane.setBinningSchemes(binRulers);
    // NOTE(review): the margins are (re)set only after the rulers above were
    // laid out with the previous margin values - confirm this order is intended
    m_hMargin = 10;
    m_vMargin = 10;
    drawingPane.setHorizontalMargin(m_hMargin);
    // make the plotter tall enough to show the lowest ruler plus some padding
    setHeight(binRulers[binRulers.length - 1].getLeftStartPoint().y + 40);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) DoubleCell(org.knime.core.data.def.DoubleCell) DiscretizationScheme(org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme) ArrayList(java.util.ArrayList) DoubleCoordinate(org.knime.base.util.coordinate.DoubleCoordinate) DataColumnSpec(org.knime.core.data.DataColumnSpec) MultiColumnPlotterProperties(org.knime.base.node.viz.plotter.columns.MultiColumnPlotterProperties) DataColumnDomainCreator(org.knime.core.data.DataColumnDomainCreator) Point(java.awt.Point) Point(java.awt.Point) DoubleCoordinate(org.knime.base.util.coordinate.DoubleCoordinate) Coordinate(org.knime.base.util.coordinate.Coordinate) StringCell(org.knime.core.data.def.StringCell) DataCell(org.knime.core.data.DataCell)

Example 2 with DiscretizationScheme

use of org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme in project knime-core by knime.

The class CAIMDiscretizationNodeModel, method execute.

/**
 * {@inheritDoc}
 *
 * Computes one {@link DiscretizationScheme} per included column and returns
 * the discretized table together with the resulting
 * {@link DiscretizationModel}. If no columns are included, the input is
 * passed through unchanged together with an empty model.
 *
 * For each column the scheme starts as a single interval spanning the
 * column's domain bounds. Candidate boundaries are then inserted greedily:
 * in every round each remaining candidate is tried, its CAIM value is
 * computed via a {@link QuantaMatrix2D}, and the candidate with the highest
 * value is accepted.
 *
 * @param inData the input port objects; index 0 is read as a
 *            {@link BufferedDataTable}
 * @param exec the execution context used for progress reporting,
 *            cancellation checks and table creation
 * @return an array holding the result table and the discretization model
 * @throws Exception if the execution is canceled or the computation fails
 */
@Override
protected PortObject[] execute(final PortObject[] inData, final ExecutionContext exec) throws Exception {
    // measure the time
    long startTime = System.currentTimeMillis();
    // empty model: nothing is included, so pass the input through unchanged
    if (m_includedColumnNames.getIncludeList() == null || m_includedColumnNames.getIncludeList().size() == 0) {
        return new PortObject[] { inData[0], new DiscretizationModel() };
    }
    LOGGER.debug("Start discretizing.");
    // as the algorithm is for binary class problems only
    // (positive, negative) the algorithm is performed for each class value
    // labeled as positive class and the rest as negative
    exec.setProgress(0.0, "Preparing...");
    // check input data
    BufferedDataTable data = (BufferedDataTable) inData[0];
    // get class column index
    m_classifyColumnIndex = data.getDataTableSpec().findColumnIndex(m_classColumnName.getStringValue());
    assert m_classifyColumnIndex > -1;
    // create the class - index mapping
    createClassFromToIndexMaps(data.getDataTableSpec());
    // create the array with the result discretization schemes for
    // each included column
    DiscretizationScheme[] resultSchemes = new DiscretizationScheme[m_includedColumnNames.getIncludeList().size()];
    // for all included columns do the discretization
    int currentColumn = 0;
    for (String includedColumnName : m_includedColumnNames.getIncludeList()) {
        LOGGER.debug("Process column: " + includedColumnName);
        exec.setProgress("Discretizing column '" + includedColumnName + "'");
        // every column gets an equal share of the overall progress
        ExecutionContext subExecPerColumn = exec.createSubExecutionContext(1.0D / m_includedColumnNames.getIncludeList().size());
        subExecPerColumn.checkCanceled();
        // never discretize the class column (should never happen)
        // NOTE(review): skipping here does not advance currentColumn, so
        // resultSchemes would keep a trailing null entry - confirm that this
        // case really cannot occur
        if (m_classColumnName.getStringValue().equals(includedColumnName)) {
            continue;
        }
        // determine the column index of the current column
        int columnIndex = data.getDataTableSpec().findColumnIndex(includedColumnName);
        DataColumnDomain domain = data.getDataTableSpec().getColumnSpec(columnIndex).getDomain();
        // NOTE(review): assumes the domain has both bounds set and that they
        // are DoubleValues - verify this is guaranteed before execute runs
        double minValue = ((DoubleValue) domain.getLowerBound()).getDoubleValue();
        double maxValue = ((DoubleValue) domain.getUpperBound()).getDoubleValue();
        // find all distinct values of the column and create
        // a table with all possible interval boundaries (midpoint value of
        // adjacent values)
        subExecPerColumn.setProgress("Find possible boundaries.");
        BoundaryScheme boundaryScheme = null;
        // create subExec for sorting (10% of the per-column progress)
        ExecutionContext subExecSort = subExecPerColumn.createSubExecutionContext(0.1);
        // long t1 = System.currentTimeMillis();
        // two alternative implementations of the boundary creation, chosen
        // by the m_classOptimizedVersion flag
        if (m_classOptimizedVersion) {
            boundaryScheme = createAllIntervalBoundaries(data, columnIndex, subExecSort);
        } else {
            boundaryScheme = createAllIntervalBoundaries2(data, columnIndex, subExecSort);
        }
        subExecSort.setProgress(1.0D);
        // long t2 = System.currentTimeMillis() - t1;
        // LOGGER.error("Create boundaries time: " + (t2 / 1000.0)
        // + " optimized: " + m_classOptimizedVersion);
        // LOGGER.error("Boundaries: " + boundaryScheme.getHead());
        // head of the linked list of candidate boundaries; iteration below
        // starts at the head's successor
        LinkedDouble allIntervalBoundaries = boundaryScheme.getHead();
        // create the initial discretization scheme: one interval covering
        // the whole domain of the column
        DiscretizationScheme discretizationScheme = new DiscretizationScheme(new Interval(minValue, maxValue, true, true));
        // CAIM value of the scheme accepted so far
        double globalCAIM = 0;
        // perform the iterative search for the best intervals
        int numInsertedBounds = 0;
        // best CAIM value found in the current round
        double currentCAIM = 0;
        // create subExec for inserted bounds (90% of the per-column progress)
        ExecutionContext subExecBounds = subExecPerColumn.createSubExecutionContext(0.9);
        // NOTE(review): the loop condition uses m_classValues.length - 1
        // while the acceptance check below uses m_classValues.length -
        // confirm the differing thresholds are intended
        while (currentCAIM > globalCAIM || numInsertedBounds < m_classValues.length - 1) {
            subExecPerColumn.checkCanceled();
            // create subExec for counting
            ExecutionContext subExecCount = subExecBounds.createSubExecutionContext(1.0D / m_classValues.length);
            // LOGGER.debug("Inserted bounds: " + numInsertedBounds);
            // LOGGER.debug("intervall boundaries: " +
            // allIntervalBoundaries);
            // for all possible interval boundaries
            // insert each one, calculate the caim value and add
            // the one with the biggest caim
            LinkedDouble intervalBoundary = allIntervalBoundaries.m_next;
            currentCAIM = 0;
            LinkedDouble bestBoundary = null;
            long currentCountedBoundaries = 0;
            while (intervalBoundary != null) {
                subExecPerColumn.checkCanceled();
                // set progress
                currentCountedBoundaries++;
                subExecCount.setProgress((double) currentCountedBoundaries / (double) boundaryScheme.getNumBoundaries(), "Count for possible boundary " + currentCountedBoundaries + " of " + boundaryScheme.getNumBoundaries());
                // LOGGER.debug("current caim: " + currentCAIM);
                // try the candidate on a copy so the accepted scheme stays
                // untouched
                DiscretizationScheme tentativeDS = new DiscretizationScheme(discretizationScheme);
                tentativeDS.insertBound(intervalBoundary.m_value);
                // create the quanta matrix
                QuantaMatrix2D quantaMatrix = new QuantaMatrix2D(tentativeDS, m_classValueToIndexMap);
                // pass the data for filling the matrix
                quantaMatrix.countData(data, columnIndex, m_classifyColumnIndex);
                // calculate the caim
                double caim = quantaMatrix.calculateCaim();
                // remember the candidate with the highest caim of this round
                if (caim > currentCAIM) {
                    currentCAIM = caim;
                    bestBoundary = intervalBoundary;
                }
                intervalBoundary = intervalBoundary.m_next;
            }
            // if there is no best boundary, break the first while loop
            if (bestBoundary == null) {
                break;
            }
            // in this case accept the best discretization scheme
            if (currentCAIM > globalCAIM || numInsertedBounds < m_classValues.length) {
                int numIntervals = discretizationScheme.getNumIntervals();
                discretizationScheme.insertBound(bestBoundary.m_value);
                // remove the linked list element from the list
                bestBoundary.remove();
                globalCAIM = currentCAIM;
                // an accepted bound must have increased the interval count
                if (numIntervals < discretizationScheme.getNumIntervals()) {
                    numInsertedBounds++;
                    subExecPerColumn.setProgress("Inserted bound " + numInsertedBounds);
                // LOGGER.debug("Inserted boundary: "
                // + bestBoundary.m_value);
                } else {
                    throw new IllegalStateException("Only usefull bounds should be inserted: " + bestBoundary.m_value);
                }
            }
            subExecCount.setProgress(1.0D);
        }
        resultSchemes[currentColumn] = discretizationScheme;
        subExecBounds.setProgress(1.0D);
        // ensure the full progress is set for this iteration
        subExecPerColumn.setProgress(1.0D);
        currentColumn++;
    }
    // set the model
    DataTableSpec modelSpec = createModelSpec(m_includedColumnNames, data.getDataTableSpec());
    m_discretizationModel = new DiscretizationModel(resultSchemes, modelSpec);
    // create an output table that replaces the included columns by
    // interval values
    BufferedDataTable resultTable = createResultTable(exec, data, m_discretizationModel);
    // log the runtime of the execute method
    long runtime = System.currentTimeMillis() - startTime;
    LOGGER.debug("Binning runtime: " + (runtime / 1000.0) + " sec.");
    return new PortObject[] { resultTable, m_discretizationModel };
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) DiscretizationScheme(org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme) SettingsModelFilterString(org.knime.core.node.defaultnodesettings.SettingsModelFilterString) SettingsModelString(org.knime.core.node.defaultnodesettings.SettingsModelString) ExecutionContext(org.knime.core.node.ExecutionContext) DataColumnDomain(org.knime.core.data.DataColumnDomain) DoubleValue(org.knime.core.data.DoubleValue) DiscretizationModel(org.knime.base.node.preproc.discretization.caim2.DiscretizationModel) BufferedDataTable(org.knime.core.node.BufferedDataTable) PortObject(org.knime.core.node.port.PortObject) Interval(org.knime.base.node.preproc.discretization.caim2.Interval)

Example 3 with DiscretizationScheme

use of org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme in project knime-core by knime.

The class CAIMDiscretizationNodeModel, method createResultTable.

/**
 * Builds a new {@link BufferedDataTable} from the given input table in which
 * every column covered by the {@link DiscretizationModel} is replaced by a
 * String column holding the discrete value that the column's
 * {@link DiscretizationScheme} assigns to the original number. All other
 * columns, as well as missing cells, are copied unchanged.
 *
 * NOTE(review): schemes are consumed in table column order; this presumably
 * relies on the filtered schemes being ordered like the table's included
 * columns - verify against filterNotKnownSchemes.
 *
 * @param exec the context used to create the result container and to report
 *            progress
 * @param table the input data table
 * @param discretizationModel the {@link DiscretizationModel} providing the
 *            schemes and the names of the columns they belong to
 * @return the discretized input data
 */
public static BufferedDataTable createResultTable(final ExecutionContext exec, final BufferedDataTable table, final DiscretizationModel discretizationModel) {
    final DiscretizationScheme[] allSchemes = discretizationModel.getSchemes();
    final String[] includedColumnNames = discretizationModel.getIncludedColumnNames();
    // drop schemes whose column does not exist in the given table
    final DiscretizationScheme[] schemes = filterNotKnownSchemes(allSchemes, includedColumnNames, table.getDataTableSpec());

    // derive the output spec: included columns become String columns
    final DataTableSpec inputSpec = table.getDataTableSpec();
    final DataColumnSpec[] outputColumnSpecs = new DataColumnSpec[inputSpec.getNumColumns()];
    // discretize[c] tells whether column c has to be converted
    final boolean[] discretize = new boolean[outputColumnSpecs.length];
    int columnPos = 0;
    for (DataColumnSpec inputColumnSpec : inputSpec) {
        final boolean isTarget = isIncluded(inputColumnSpec, includedColumnNames) >= 0;
        discretize[columnPos] = isTarget;
        outputColumnSpecs[columnPos] = isTarget
            ? new DataColumnSpecCreator(inputColumnSpec.getName(), StringCell.TYPE).createSpec()
            : inputColumnSpec;
        columnPos++;
    }
    final BufferedDataContainer output = exec.createDataContainer(new DataTableSpec(outputColumnSpecs));

    // convert the rows, reporting progress every 200 rows
    final double totalRows = table.size();
    double processedRows = 0;
    for (DataRow inputRow : table) {
        if (processedRows % 200 == 0) {
            exec.setProgress(processedRows / totalRows);
        }
        final DataCell[] outputCells = new DataCell[inputRow.getNumCells()];
        int cellPos = 0;
        // position of the scheme belonging to the next included column
        int schemePos = 0;
        for (DataCell inputCell : inputRow) {
            final boolean convert = discretize[cellPos];
            if (convert && !inputCell.isMissing()) {
                // map the numeric value to its interval label
                final double numericValue = ((DoubleValue) inputCell).getDoubleValue();
                outputCells[cellPos] = new StringCell(schemes[schemePos].getDiscreteValue(numericValue));
            } else {
                // not included or missing: keep the cell as it is
                outputCells[cellPos] = inputCell;
            }
            if (convert) {
                // advance to the next scheme even when the cell was missing
                schemePos++;
            }
            cellPos++;
        }
        output.addRowToTable(new DefaultRow(inputRow.getKey(), outputCells));
        processedRows++;
    }
    output.close();
    return output.getTable();
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) BufferedDataContainer(org.knime.core.node.BufferedDataContainer) DiscretizationScheme(org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme) SettingsModelFilterString(org.knime.core.node.defaultnodesettings.SettingsModelFilterString) SettingsModelString(org.knime.core.node.defaultnodesettings.SettingsModelString) DataRow(org.knime.core.data.DataRow) DataColumnSpec(org.knime.core.data.DataColumnSpec) DoubleValue(org.knime.core.data.DoubleValue) StringCell(org.knime.core.data.def.StringCell) DataCell(org.knime.core.data.DataCell) DefaultRow(org.knime.core.data.def.DefaultRow)

Example 4 with DiscretizationScheme

use of org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme in project knime-core by knime.

The class BinModelPlotter, method getSelectedSchemes.

/**
 * Creates an array of {@link DiscretizationScheme}s that contains the
 * schemes for the selected columns, in the iteration order of the selection.
 *
 * The array has one slot per selected column. A selected column that is not
 * among the model's included columns contributes no scheme, so in that case
 * the trailing slots of the returned array remain {@code null}.
 *
 * @return the selected discretization schemes
 */
private DiscretizationScheme[] getSelectedSchemes() {
    String[] includedColumns = m_discretizationModel.getIncludedColumnNames();
    // fetch the schemes once instead of once per matching column
    DiscretizationScheme[] allSchemes = m_discretizationModel.getSchemes();
    DiscretizationScheme[] result = new DiscretizationScheme[m_selectedColumns.size()];
    int counter = 0;
    for (String column : m_selectedColumns) {
        for (int i = 0; i < includedColumns.length; i++) {
            if (includedColumns[i].equals(column)) {
                result[counter] = allSchemes[i];
                counter++;
                // at most one scheme per selected column; stopping here keeps
                // counter within the bounds of the result array
                break;
            }
        }
    }
    return result;
}
Also used : DiscretizationScheme(org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme) Point(java.awt.Point)

Aggregations

DiscretizationScheme (org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme)4 Point (java.awt.Point)2 DataCell (org.knime.core.data.DataCell)2 DataColumnSpec (org.knime.core.data.DataColumnSpec)2 DataColumnSpecCreator (org.knime.core.data.DataColumnSpecCreator)2 DataTableSpec (org.knime.core.data.DataTableSpec)2 DoubleValue (org.knime.core.data.DoubleValue)2 StringCell (org.knime.core.data.def.StringCell)2 SettingsModelFilterString (org.knime.core.node.defaultnodesettings.SettingsModelFilterString)2 SettingsModelString (org.knime.core.node.defaultnodesettings.SettingsModelString)2 ArrayList (java.util.ArrayList)1 LinkedHashSet (java.util.LinkedHashSet)1 DiscretizationModel (org.knime.base.node.preproc.discretization.caim2.DiscretizationModel)1 Interval (org.knime.base.node.preproc.discretization.caim2.Interval)1 MultiColumnPlotterProperties (org.knime.base.node.viz.plotter.columns.MultiColumnPlotterProperties)1 Coordinate (org.knime.base.util.coordinate.Coordinate)1 DoubleCoordinate (org.knime.base.util.coordinate.DoubleCoordinate)1 DataColumnDomain (org.knime.core.data.DataColumnDomain)1 DataColumnDomainCreator (org.knime.core.data.DataColumnDomainCreator)1 DataRow (org.knime.core.data.DataRow)1