Search in sources :

Example 16 with SortedTable

use of org.knime.base.data.sort.SortedTable in project knime-core by knime.

the class AutoBinner method execute.

/**
 * Computes the bin edges of all included columns with the configured binning
 * method and turns them into a discretisation operation.
 *
 * @param data the input data
 * @param exec the execution context used for progress reporting and for the
 *            temporary tables created while binning
 * @return the operation with the discretisation information
 * @throws IllegalStateException if the configured binning method is neither
 *             {@code fixedNumber} nor {@code sampleQuantiles}
 * @throws Exception if one of the helper computations fails
 */
public PMMLPreprocDiscretize execute(final BufferedDataTable data, final ExecutionContext exec) throws Exception {
    final DataTableSpec spec = data.getDataTableSpec();
    if (m_settings.getMethod().equals(Method.fixedNumber)) {
        if (m_settings.getEqualityMethod().equals(EqualityMethod.width)) {
            return createDisretizeOp(equalWidthEdges(data, exec));
        }
        // EqualityMethod.equalCount
        return createDisretizeOp(equalCountEdges(data));
    }
    if (m_settings.getMethod().equals(Method.sampleQuantiles)) {
        return createDisretizeOp(sampleQuantileEdges(data, spec, exec));
    }
    throw new IllegalStateException("Unknown binning method.");
}

/**
 * Fixed number of bins of equal width: the edges are derived from the domain
 * bounds of each included column.
 */
private Map<String, double[]> equalWidthEdges(final BufferedDataTable data, final ExecutionContext exec) throws Exception {
    final BufferedDataTable boundedData = calcDomainBoundsIfNeccessary(data, exec.createSubExecutionContext(0.9), Arrays.asList(m_included));
    final DataTableSpec boundedSpec = boundedData.getDataTableSpec();
    init(boundedSpec);
    final Map<String, double[]> edgesByColumn = new HashMap<String, double[]>();
    for (final String column : m_included) {
        final DataColumnSpec columnSpec = boundedSpec.getColumnSpec(column);
        // lower and upper bound of the column domain
        final double lower = ((DoubleValue) columnSpec.getDomain().getLowerBound()).getDoubleValue();
        final double upper = ((DoubleValue) columnSpec.getDomain().getUpperBound()).getDoubleValue();
        final double[] rawEdges = calculateBounds(m_settings.getBinCount(), lower, upper);
        edgesByColumn.put(column, m_settings.getIntegerBounds() ? toIntegerBoundaries(rawEdges) : rawEdges);
    }
    return edgesByColumn;
}

/**
 * Fixed number of bins of equal count: the edges are derived from all
 * non-missing values of each included column.
 */
private Map<String, double[]> equalCountEdges(final BufferedDataTable data) throws Exception {
    final Map<String, double[]> edgesByColumn = new HashMap<String, double[]>();
    for (final String column : m_included) {
        final int index = data.getDataTableSpec().findColumnIndex(column);
        final List<Double> observed = new ArrayList<Double>();
        for (final DataRow row : data) {
            if (row.getCell(index).isMissing()) {
                // missing cells do not contribute to the bins
                continue;
            }
            observed.add(((DoubleValue) row.getCell(index)).getDoubleValue());
        }
        edgesByColumn.put(column, findEdgesForEqualCount(observed, m_settings.getBinCount()));
    }
    return edgesByColumn;
}

/**
 * Sample quantiles: each included column is isolated, sorted ascending and
 * the edges are read from the sorted single-column table.
 */
private Map<String, double[]> sampleQuantileEdges(final BufferedDataTable data, final DataTableSpec spec, final ExecutionContext exec) throws Exception {
    init(spec);
    final Map<String, double[]> edgesByColumn = new LinkedHashMap<String, double[]>();
    final int columnTotal = m_included.length;
    // contains all numeric columns if include all is set!
    for (final String column : m_included) {
        exec.setMessage("Calculating quantiles (column \"" + column + "\")");
        // per column, 70% of the progress share goes to sorting and 30% to the quantile computation
        final ExecutionContext sortExec = exec.createSubExecutionContext(0.7 / columnTotal);
        final ExecutionContext quantileExec = exec.createSubExecutionContext(0.3 / columnTotal);
        final ColumnRearranger keepTarget = new ColumnRearranger(spec);
        keepTarget.keepOnly(column);
        final BufferedDataTable singleColumn = sortExec.createColumnRearrangeTable(data, keepTarget, sortExec);
        final SortedTable ascending = new SortedTable(singleColumn, Collections.singletonList(column), new boolean[] { true }, sortExec);
        sortExec.setProgress(1.0);
        final double[] rawEdges = createEdgesFromQuantiles(ascending.getBufferedDataTable(), quantileExec, m_settings.getSampleQuantiles());
        quantileExec.setProgress(1.0);
        // the single-column table is only needed for this column
        exec.clearTable(singleColumn);
        edgesByColumn.put(column, m_settings.getIntegerBounds() ? toIntegerBoundaries(rawEdges) : rawEdges);
    }
    return edgesByColumn;
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) DataRow(org.knime.core.data.DataRow) DataColumnSpec(org.knime.core.data.DataColumnSpec) ExecutionContext(org.knime.core.node.ExecutionContext) ColumnRearranger(org.knime.core.data.container.ColumnRearranger) DoubleValue(org.knime.core.data.DoubleValue) SortedTable(org.knime.base.data.sort.SortedTable) BufferedDataTable(org.knime.core.node.BufferedDataTable) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 17 with SortedTable

use of org.knime.base.data.sort.SortedTable in project knime-core by knime.

the class CAIMDiscretizationNodeModel method createAllIntervalBoundaries.

/**
 * Collects the candidate interval boundaries of one column.
 *
 * The table is sorted in ascending order on the given column (and, if reduced
 * boundaries are enabled, additionally on the class column). The sorted rows
 * are scanned once. Whenever the column value changes and a class value
 * different from the first class value of the current run of equal values has
 * been seen, the midpoint between the previous and the new value is appended
 * to a doubly linked list. A value change that produced no boundary is
 * remembered, so that its midpoint can still be added if the class changes
 * within the following run of equal values.
 *
 * @param table the table with the data
 * @param columnIndex the column of interest
 * @param exec the execution context to set the progress
 * @return the boundary scheme created from the head of the linked boundary
 *         list and the number of boundaries appended to it
 * @throws Exception if creating the sorted table fails
 */
private BoundaryScheme createAllIntervalBoundaries(final BufferedDataTable table, final int columnIndex, final ExecutionContext exec) throws Exception {
    // sort keys: the column of interest, optionally followed by the class column
    final List<String> sortKeys = new ArrayList<String>();
    sortKeys.add(table.getDataTableSpec().getColumnSpec(columnIndex).getName());
    if (m_reducedBoundaries) {
        sortKeys.add(m_classColumnName.getStringValue());
    }
    // all sort keys are sorted in ascending order
    final boolean[] ascending = new boolean[sortKeys.size()];
    Arrays.fill(ascending, true);
    final SortedTable sorted = new SortedTable(table, sortKeys, ascending, m_sortInMemory.getBooleanValue(), exec);
    final RowIterator rows = sorted.iterator();

    // skip rows until the first non-missing value; it starts the first run
    double runValue = Double.NaN;
    String runFirstClass = null;
    boolean startFound = false;
    while (!startFound && rows.hasNext()) {
        final DataRow candidate = rows.next();
        if (candidate.getCell(columnIndex).isMissing()) {
            continue;
        }
        runValue = ((DoubleValue) candidate.getCell(columnIndex)).getDoubleValue();
        // the class value belonging to the first row of the run
        runFirstClass = candidate.getCell(m_classifyColumnIndex).toString();
        startFound = true;
    }

    // previous run value of a value change that did not create a boundary;
    // NaN means there is no such pending change
    double pendingValue = Double.NaN;
    // head of the doubly linked boundary list
    final LinkedDouble head = new LinkedDouble(Double.NEGATIVE_INFINITY);
    LinkedDouble tail = head;
    int boundaryCount = 0;
    // whether a different class value was seen since the current run started
    boolean classChangedInRun = false;

    while (rows.hasNext()) {
        final DataRow row = rows.next();
        final DataCell cell = row.getCell(columnIndex);
        final double value = ((DoubleValue) cell).getDoubleValue();
        final String classValue = row.getCell(m_classifyColumnIndex).toString();
        final boolean valueChanged = value != runValue;

        if (!classChangedInRun && !runFirstClass.equals(classValue)) {
            classChangedInRun = true;
            // the class changes together with the value, so the pending
            // value change is not needed any more
            if (valueChanged) {
                pendingValue = Double.NaN;
            }
        }

        // within a run of equal values no boundary is added
        if (!valueChanged) {
            continue;
        }

        if (classChangedInRun) {
            if (!Double.isNaN(pendingValue)) {
                // catch up on the boundary of the pending value change
                tail = appendBoundary(tail, (runValue + pendingValue) / 2.0D);
                boundaryCount++;
            }
            // midpoint between the previous run and the new value
            tail = appendBoundary(tail, (value + runValue) / 2.0D);
            boundaryCount++;
            pendingValue = Double.NaN;
        } else {
            // no boundary for this change yet; remember the previous value
            pendingValue = runValue;
        }

        // start the next run with this row
        runValue = value;
        runFirstClass = classValue;
        classChangedInRun = false;
    }
    return new BoundaryScheme(head, boundaryCount);
}

/**
 * Appends a new node holding the given boundary behind the given tail of the
 * linked list and returns the new node, i.e. the new tail.
 */
private LinkedDouble appendBoundary(final LinkedDouble tail, final double boundary) {
    final LinkedDouble node = new LinkedDouble(boundary);
    tail.m_next = node;
    node.m_previous = tail;
    return node;
}
Also used : ArrayList(java.util.ArrayList) SettingsModelFilterString(org.knime.core.node.defaultnodesettings.SettingsModelFilterString) SettingsModelString(org.knime.core.node.defaultnodesettings.SettingsModelString) DataRow(org.knime.core.data.DataRow) DoubleValue(org.knime.core.data.DoubleValue) SortedTable(org.knime.base.data.sort.SortedTable) RowIterator(org.knime.core.data.RowIterator) DataCell(org.knime.core.data.DataCell)

Example 18 with SortedTable

use of org.knime.base.data.sort.SortedTable in project knime-core by knime.

the class NewJoinerNodeModel method execute.

/**
 * {@inheritDoc}
 *
 * Joins the first input table with the second one. The second table is
 * sorted so that its rows follow the row order of the first table (rows
 * without a partner are placed at the end); afterwards both tables are
 * merged in a single pass. Depending on the configured join mode, rows
 * without a partner are dropped or padded with missing cells.
 */
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
    BufferedDataTable leftTable = inData[0];
    BufferedDataTable rightTable = inData[1];
    m_secondTableColIndex = rightTable.getDataTableSpec().findColumnIndex(m_settings.secondTableColumn());
    // an index of -1 is only acceptable if the join is done on the row key
    if (!NewJoinerSettings.ROW_KEY_IDENTIFIER.equals(m_settings.secondTableColumn()) && (m_secondTableColIndex == -1)) {
        throw new InvalidSettingsException("Join column '" + m_settings.secondTableColumn() + "' not found in second table");
    }
    BufferedDataContainer dc = exec.createDataContainer(createSpec(new DataTableSpec[] { leftTable.getDataTableSpec(), rightTable.getDataTableSpec() }));
    // create a row with missing values for left or full outer joins
    DataCell[] missingCells = new DataCell[rightTable.getDataTableSpec().getNumColumns()];
    for (int i = 0; i < missingCells.length; i++) {
        missingCells[i] = DataType.getMissingCell();
    }
    DataRow missingRow = new DefaultRow(new RowKey(""), missingCells);
    exec.setMessage("Reading first table");
    // build a map for sorting the second table which maps the row keys of
    // the first table to their row number
    final Map<String, Integer> orderMap = buildTableOrdering(leftTable, exec);
    Comparator<DataRow> rowComparator = new Comparator<DataRow>() {

        public int compare(final DataRow o1, final DataRow o2) {
            Integer k1 = orderMap.get(getRightJoinKey(o1));
            Integer k2 = orderMap.get(getRightJoinKey(o2));
            if ((k1 != null) && (k2 != null)) {
                // compareTo instead of subtraction: cannot overflow
                return k1.compareTo(k2);
            } else if (k1 != null) {
                // rows with a partner come before rows without one
                return -1;
            } else if (k2 != null) {
                return 1;
            } else {
                return 0;
            }
        }
    };
    // sort the second table based on the key order from the first table
    // non-matching rows are placed at the end
    exec.setMessage("Sorting second table");
    SortedTable rightSortedTable = new SortedTable(rightTable, rowComparator, false, exec.createSubExecutionContext(0.7));
    Iterator<DataRow> lit = leftTable.iterator();
    Iterator<DataRow> rit = rightSortedTable.iterator();
    exec.setMessage("Joining tables");
    // estimated number of output rows, only used for progress reporting
    final double max;
    // lofj: keep left rows without partner, rofj: keep right rows without partner
    boolean lofj = false;
    boolean rofj = false;
    if (JoinMode.InnerJoin.equals(m_settings.joinMode())) {
        max = Math.min(leftTable.getRowCount(), rightTable.getRowCount());
    } else if (JoinMode.LeftOuterJoin.equals(m_settings.joinMode())) {
        max = leftTable.getRowCount();
        lofj = true;
    } else if (JoinMode.RightOuterJoin.equals(m_settings.joinMode())) {
        max = rightTable.getRowCount();
        rofj = true;
    } else {
        max = Math.max(leftTable.getRowCount(), rightTable.getRowCount());
        lofj = true;
        rofj = true;
    }
    // now join the two tables
    int p = 0;
    DataRow lrow = lit.hasNext() ? lit.next() : null;
    DataRow rrow = rit.hasNext() ? rit.next() : null;
    String lkey = (lrow != null) ? lrow.getKey().getString() : null;
    String rkey = (rrow != null) ? getRightJoinKey(rrow) : null;
    if ((rrow == null) && (lrow != null) && lofj) {
        // The second table has no rows, so the merge loop below is never
        // entered. The first left row has already been fetched from the
        // iterator and would be lost, because the loop handling the
        // remaining left rows continues with the next one.
        dc.addRowToTable(createJoinedRow(lkey, lrow, missingRow));
        exec.setProgress(0.7 + 0.3 * p++ / max);
    }
    outer: while ((lrow != null) && (rrow != null)) {
        exec.checkCanceled();
        String key = lkey;
        if (lkey.equals(rkey)) {
            // loop over all matching rows in the second table
            for (int i = 0; lkey.equals(rkey); i++) {
                dc.addRowToTable(createJoinedRow(key, lrow, rrow));
                exec.setProgress(0.7 + 0.3 * p++ / max);
                if (!rit.hasNext()) {
                    // second table exhausted; rrow == null marks that no
                    // unmatched right row is pending
                    rrow = null;
                    break outer;
                }
                rrow = rit.next();
                rkey = getRightJoinKey(rrow);
                // further matches of the same left row get a suffixed key
                key = lkey + m_settings.keySuffix() + i;
            }
        } else if (lofj) {
            // no matching row from right table => fill with missing values
            // if left or full outer join is required
            dc.addRowToTable(createJoinedRow(lkey, lrow, missingRow));
            exec.setProgress(0.7 + 0.3 * p++ / max);
        }
        if (!lit.hasNext()) {
            break outer;
        }
        lrow = lit.next();
        lkey = lrow.getKey().getString();
    }
    if (lit.hasNext() && lofj) {
        // outer join: add the remaining left rows, the current one has
        // already been written
        while (lit.hasNext()) {
            lrow = lit.next();
            dc.addRowToTable(createJoinedRow(lrow.getKey().toString(), lrow, missingRow));
            exec.setProgress(0.7 + 0.3 * p++ / max);
        }
    } else if ((rrow != null) && rofj) {
        // add remaining non-joined rows from the right table if right or
        // full outer join
        missingCells = new DataCell[leftTable.getDataTableSpec().getNumColumns()];
        for (int i = 0; i < missingCells.length; i++) {
            missingCells[i] = DataType.getMissingCell();
        }
        missingRow = new DefaultRow(new RowKey(""), missingCells);
        boolean warningSet = false;
        while (true) {
            String key = rrow.getKey().toString();
            int c = 0;
            // retry with a modified key if the row key already exists,
            // give up after more than ten attempts
            while (true) {
                try {
                    dc.addRowToTable(createJoinedRow(key, missingRow, rrow));
                    exec.setProgress(0.7 + 0.3 * p++ / max);
                    break;
                } catch (DuplicateKeyException ex) {
                    if (++c > 10) {
                        throw ex;
                    }
                    key = key + "_r";
                    if (!warningSet) {
                        setWarningMessage("Encountered and fixed some " + "duplicate row keys at the end of the " + "table");
                        warningSet = true;
                    }
                }
            }
            if (!rit.hasNext()) {
                break;
            }
            rrow = rit.next();
        }
    }
    dc.close();
    return new BufferedDataTable[] { dc.getTable() };
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) BufferedDataContainer(org.knime.core.node.BufferedDataContainer) RowKey(org.knime.core.data.RowKey) DataRow(org.knime.core.data.DataRow) DuplicateKeyException(org.knime.core.util.DuplicateKeyException) Comparator(java.util.Comparator) InvalidSettingsException(org.knime.core.node.InvalidSettingsException) SortedTable(org.knime.base.data.sort.SortedTable) BufferedDataTable(org.knime.core.node.BufferedDataTable) DataCell(org.knime.core.data.DataCell) DefaultRow(org.knime.core.data.def.DefaultRow)

Aggregations

SortedTable (org.knime.base.data.sort.SortedTable)18 DataRow (org.knime.core.data.DataRow)16 BufferedDataTable (org.knime.core.node.BufferedDataTable)13 DataTableSpec (org.knime.core.data.DataTableSpec)12 ArrayList (java.util.ArrayList)11 DataCell (org.knime.core.data.DataCell)10 ExecutionContext (org.knime.core.node.ExecutionContext)10 DefaultRow (org.knime.core.data.def.DefaultRow)9 DataColumnSpec (org.knime.core.data.DataColumnSpec)8 DoubleValue (org.knime.core.data.DoubleValue)8 RowKey (org.knime.core.data.RowKey)8 LinkedHashMap (java.util.LinkedHashMap)7 ColumnRearranger (org.knime.core.data.container.ColumnRearranger)7 DataContainer (org.knime.core.data.container.DataContainer)5 BufferedDataContainer (org.knime.core.node.BufferedDataContainer)5 SettingsModelFilterString (org.knime.core.node.defaultnodesettings.SettingsModelFilterString)5 Map (java.util.Map)4 HashMap (java.util.HashMap)3 HashSet (java.util.HashSet)3 DataColumnSpecCreator (org.knime.core.data.DataColumnSpecCreator)3