Search in sources :

Example 6 with SortedTable

use of org.knime.base.data.sort.SortedTable in project knime-core by knime.

the class AutoBinner method execute.

/**
 * Determines the bin edges of every target column and wraps them into a discretisation operation.
 *
 * <p>If no target columns are configured, or "include all" is set, {@code addAllNumericCols} is invoked
 * with the input spec before any edges are computed. The edges are then derived according to the
 * configured method:
 * <ul>
 * <li>{@code Method.fixedNumber}: {@code getBinCount() + 1} edges, the first and last being the lower and
 * upper domain bound of the column and the inner ones spaced evenly in between;</li>
 * <li>{@code Method.sampleQuantiles}: the column is isolated, sorted and passed to
 * {@code createEdgesFromQuantiles} together with the configured sample quantiles.</li>
 * </ul>
 *
 * @param data the input data
 * @param exec the execution context; sub contexts of it are used for the domain calculation and for the
 *            per-column sorting and quantile calculation
 * @return the operation with the discretisation information
 * @throws IllegalStateException if the configured method is neither {@code fixedNumber} nor
 *             {@code sampleQuantiles}
 * @throws Exception if one of the invoked table or helper operations fails
 */
public PMMLPreprocDiscretize execute(final BufferedDataTable data, final ExecutionContext exec) throws Exception {
    final DataTableSpec spec = data.getDataTableSpec();
    // Auto configuration: no explicit targets, or all columns requested.
    final boolean autoConfigure = m_settings.getTargetColumn() == null || m_settings.getIncludeAll();
    if (autoConfigure) {
        addAllNumericCols(spec);
    }

    if (m_settings.getMethod().equals(Method.fixedNumber)) {
        final ExecutionContext domainExec = exec.createSubExecutionContext(0.9);
        final BufferedDataTable boundedData =
            calcDomainBoundsIfNeccessary(data, domainExec, Arrays.asList(m_settings.getTargetColumn()));
        final DataTableSpec boundedSpec = boundedData.getDataTableSpec();
        init(boundedSpec);

        final Map<String, double[]> edgesByColumn = new HashMap<String, double[]>();
        for (final String column : m_settings.getTargetColumn()) {
            final DataColumnSpec columnSpec = boundedSpec.getColumnSpec(column);
            // The domain bounds become the outermost edges.
            final double lower = ((DoubleValue) columnSpec.getDomain().getLowerBound()).getDoubleValue();
            final double upper = ((DoubleValue) columnSpec.getDomain().getUpperBound()).getDoubleValue();

            final int binCount = m_settings.getBinCount();
            final double[] edges = new double[binCount + 1];
            edges[0] = lower;
            edges[binCount] = upper;
            // Inner edges sit at the fractions 1/binCount, 2/binCount, ... of the domain range.
            final double range = upper - lower;
            for (int e = 1; e < binCount; e++) {
                edges[e] = lower + e / (double) binCount * range;
            }
            edgesByColumn.put(column, edges);
        }
        return createDisretizeOp(edgesByColumn);
    }

    if (!m_settings.getMethod().equals(Method.sampleQuantiles)) {
        throw new IllegalStateException("Unknown binning method.");
    }

    init(spec);
    final Map<String, double[]> edgesByColumn = new LinkedHashMap<String, double[]>();
    // The target columns contain all numeric columns if include all is set.
    final int targetCount = m_settings.getTargetColumn().length;
    for (final String column : m_settings.getTargetColumn()) {
        exec.setMessage("Calculating quantiles (column \"" + column + "\")");
        // Per column, 70% of the column's progress share goes to sorting and 30% to the quantiles.
        final ExecutionContext sortExec = exec.createSubExecutionContext(0.7 / targetCount);
        final ExecutionContext quantileExec = exec.createSubExecutionContext(0.3 / targetCount);

        // Reduce the input to the single column before sorting it.
        final ColumnRearranger keepColumn = new ColumnRearranger(spec);
        keepColumn.keepOnly(column);
        final BufferedDataTable columnTable = sortExec.createColumnRearrangeTable(data, keepColumn, sortExec);
        final SortedTable sortedColumn =
            new SortedTable(columnTable, Collections.singletonList(column), new boolean[] { true }, sortExec);
        sortExec.setProgress(1.0);

        final double[] edges = createEdgesFromQuantiles(sortedColumn.getBufferedDataTable(), quantileExec,
            m_settings.getSampleQuantiles());
        quantileExec.setProgress(1.0);

        // The intermediate single-column table is no longer needed once the edges are known.
        exec.clearTable(columnTable);
        edgesByColumn.put(column, edges);
    }
    return createDisretizeOp(edgesByColumn);
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) DataColumnSpec(org.knime.core.data.DataColumnSpec) ExecutionContext(org.knime.core.node.ExecutionContext) ColumnRearranger(org.knime.core.data.container.ColumnRearranger) DoubleValue(org.knime.core.data.DoubleValue) SortedTable(org.knime.base.data.sort.SortedTable) BufferedDataTable(org.knime.core.node.BufferedDataTable) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 7 with SortedTable

use of org.knime.base.data.sort.SortedTable in project knime-core by knime.

the class AutoBinner method execute.

/**
 * Determines the bin edges of every target column and wraps them into a discretisation operation.
 *
 * <p>If no target columns are configured, or "include all" is set, {@code addAllNumericCols} is invoked
 * with the input spec before any edges are computed. The edges are then derived as follows:
 * <ul>
 * <li>{@code Method.fixedNumber} with {@code EqualityMethod.width}: {@code getBinCount() + 1} edges, the
 * first and last being the lower and upper domain bound of the column and the inner ones spaced evenly in
 * between;</li>
 * <li>{@code Method.fixedNumber} with any other equality method (equal count): all non-missing values of
 * the column are collected and passed to {@code findEdgesForEqualCount}. This branch calls neither
 * {@code init} nor {@code toIntegerBounds};</li>
 * <li>{@code Method.sampleQuantiles}: the column is isolated, sorted and passed to
 * {@code createEdgesFromQuantiles} together with the configured sample quantiles.</li>
 * </ul>
 * In the equal-width and quantile cases the edges are additionally run through {@code toIntegerBounds}
 * when integer bounds are enabled in the settings.
 *
 * @param data the input data
 * @param exec the execution context; sub contexts of it are used for the domain calculation and for the
 *            per-column sorting and quantile calculation
 * @return the operation with the discretisation information
 * @throws IllegalStateException if the configured method is neither {@code fixedNumber} nor
 *             {@code sampleQuantiles}
 * @throws Exception if one of the invoked table or helper operations fails
 */
public PMMLPreprocDiscretize execute(final BufferedDataTable data, final ExecutionContext exec) throws Exception {
    final DataTableSpec spec = data.getDataTableSpec();
    // Auto configuration: no explicit targets, or all columns requested.
    final boolean autoConfigure = m_settings.getTargetColumn() == null || m_settings.getIncludeAll();
    if (autoConfigure) {
        addAllNumericCols(spec);
    }

    if (m_settings.getMethod().equals(Method.fixedNumber)) {
        if (!m_settings.getEqualityMethod().equals(EqualityMethod.width)) {
            // EqualityMethod.equalCount
            final Map<String, double[]> edgesByColumn = new HashMap<String, double[]>();
            for (final String column : m_settings.getTargetColumn()) {
                final int columnIndex = spec.findColumnIndex(column);
                // Gather the numeric values of the column; missing cells are left out.
                final List<Double> presentValues = new ArrayList<Double>();
                for (final DataRow row : data) {
                    if (row.getCell(columnIndex).isMissing()) {
                        continue;
                    }
                    presentValues.add(((DoubleValue) row.getCell(columnIndex)).getDoubleValue());
                }
                edgesByColumn.put(column, findEdgesForEqualCount(presentValues, m_settings.getBinCount()));
            }
            return createDisretizeOp(edgesByColumn);
        }

        // EqualityMethod.width
        final ExecutionContext domainExec = exec.createSubExecutionContext(0.9);
        final BufferedDataTable boundedData =
            calcDomainBoundsIfNeccessary(data, domainExec, Arrays.asList(m_settings.getTargetColumn()));
        final DataTableSpec boundedSpec = boundedData.getDataTableSpec();
        init(boundedSpec);

        final Map<String, double[]> edgesByColumn = new HashMap<String, double[]>();
        for (final String column : m_settings.getTargetColumn()) {
            final DataColumnSpec columnSpec = boundedSpec.getColumnSpec(column);
            // The domain bounds become the outermost edges.
            final double lower = ((DoubleValue) columnSpec.getDomain().getLowerBound()).getDoubleValue();
            final double upper = ((DoubleValue) columnSpec.getDomain().getUpperBound()).getDoubleValue();

            final int binCount = m_settings.getBinCount();
            double[] edges = new double[binCount + 1];
            edges[0] = lower;
            edges[binCount] = upper;
            // Inner edges sit at the fractions 1/binCount, 2/binCount, ... of the domain range.
            final double range = upper - lower;
            for (int e = 1; e < binCount; e++) {
                edges[e] = lower + e / (double) binCount * range;
            }
            if (m_settings.getIntegerBounds()) {
                edges = toIntegerBounds(edges);
            }
            edgesByColumn.put(column, edges);
        }
        return createDisretizeOp(edgesByColumn);
    }

    if (!m_settings.getMethod().equals(Method.sampleQuantiles)) {
        throw new IllegalStateException("Unknown binning method.");
    }

    init(spec);
    final Map<String, double[]> edgesByColumn = new LinkedHashMap<String, double[]>();
    // The target columns contain all numeric columns if include all is set.
    final int targetCount = m_settings.getTargetColumn().length;
    for (final String column : m_settings.getTargetColumn()) {
        exec.setMessage("Calculating quantiles (column \"" + column + "\")");
        // Per column, 70% of the column's progress share goes to sorting and 30% to the quantiles.
        final ExecutionContext sortExec = exec.createSubExecutionContext(0.7 / targetCount);
        final ExecutionContext quantileExec = exec.createSubExecutionContext(0.3 / targetCount);

        // Reduce the input to the single column before sorting it.
        final ColumnRearranger keepColumn = new ColumnRearranger(spec);
        keepColumn.keepOnly(column);
        final BufferedDataTable columnTable = sortExec.createColumnRearrangeTable(data, keepColumn, sortExec);
        final SortedTable sortedColumn =
            new SortedTable(columnTable, Collections.singletonList(column), new boolean[] { true }, sortExec);
        sortExec.setProgress(1.0);

        double[] edges = createEdgesFromQuantiles(sortedColumn.getBufferedDataTable(), quantileExec,
            m_settings.getSampleQuantiles());
        quantileExec.setProgress(1.0);

        // The intermediate single-column table is no longer needed once the edges are known.
        exec.clearTable(columnTable);
        if (m_settings.getIntegerBounds()) {
            edges = toIntegerBounds(edges);
        }
        edgesByColumn.put(column, edges);
    }
    return createDisretizeOp(edgesByColumn);
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) DataRow(org.knime.core.data.DataRow) DataColumnSpec(org.knime.core.data.DataColumnSpec) ExecutionContext(org.knime.core.node.ExecutionContext) ColumnRearranger(org.knime.core.data.container.ColumnRearranger) DoubleValue(org.knime.core.data.DoubleValue) SortedTable(org.knime.base.data.sort.SortedTable) BufferedDataTable(org.knime.core.node.BufferedDataTable) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 8 with SortedTable

use of org.knime.base.data.sort.SortedTable in project knime-core by knime.

the class ColumnToGridNodeModel method execute.

/**
 * Lays the included columns of the first input table out in a grid: the included cells of up to
 * {@code getColCount()} consecutive input rows are written side by side into one output row.
 *
 * <p>When a group column is configured, the input is first reduced to the included columns plus the
 * group column and sorted on the group column. Each output row then only contains input rows sharing
 * one group value, which is stored in the last cell of the output row. Cells not filled by an input
 * row are missing cells.
 *
 * {@inheritDoc}
 */
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
    final String[] includes = m_configuration.getIncludes();
    final String groupColumn = m_configuration.getGroupColumn();
    final boolean grouped = groupColumn != null;

    final ExecutionMonitor mainExec;
    final BufferedDataTable inputTable;
    if (!grouped) {
        inputTable = inData[0];
        mainExec = exec;
    } else {
        exec.setMessage("Sorting input table");
        final BufferedDataTable in = inData[0];
        final ExecutionContext sortExec = exec.createSubExecutionContext(0.5);
        final ColumnRearranger sortFilterRearranger = new ColumnRearranger(in.getDataTableSpec());
        // Columns to keep for sorting: all included columns followed by the group column.
        final String[] relevantCols = Arrays.copyOf(includes, includes.length + 1);
        relevantCols[includes.length] = groupColumn;
        sortFilterRearranger.keepOnly(relevantCols);
        final ExecutionMonitor filterProgress = exec.createSubProgress(0.0);
        final BufferedDataTable toBeSortedTable =
            exec.createColumnRearrangeTable(in, sortFilterRearranger, filterProgress);
        final SortedTable sorter = new SortedTable(toBeSortedTable, Collections.singletonList(groupColumn),
            new boolean[] { true }, sortExec);
        inputTable = sorter.getBufferedDataTable();
        // Sorting used the first half of the progress; assembling the output gets the second half.
        mainExec = exec.createSubProgress(0.5);
    }

    exec.setMessage("Assembling output");
    final DataTableSpec spec = inputTable.getDataTableSpec();
    final DataTableSpec outSpec = createOutputSpec(spec);
    final BufferedDataContainer cont = exec.createDataContainer(outSpec);

    // Positions of the included columns within the (possibly filtered and sorted) input.
    final int[] includeIndices = new int[includes.length];
    int slot = 0;
    for (final String include : includes) {
        includeIndices[slot++] = spec.findColumnIndex(include);
    }

    final int gridCount = m_configuration.getColCount();
    final int includeCount = includeIndices.length;
    // One extra trailing cell carries the group value when grouping is enabled.
    final int groupColIndex = grouped ? spec.findColumnIndex(groupColumn) : -1;
    final int cellCount = includeCount * gridCount + (grouped ? 1 : 0);
    final DataCell[] cells = new DataCell[cellCount];

    final PushBackRowIterator it = new PushBackRowIterator(inputTable.iterator());
    final long totalRows = inputTable.size();
    long currentRow = 0;
    long currentOutRow = 0;
    DataCell curGroupValue = null;
    while (it.hasNext()) {
        // Start every output row with missing cells only.
        Arrays.fill(cells, DataType.getMissingCell());
        if (groupColIndex >= 0) {
            // Peek at the next row to learn the group value of this output row.
            final DataRow peeked = it.next();
            curGroupValue = peeked.getCell(groupColIndex);
            cells[cells.length - 1] = curGroupValue;
            it.pushBack(peeked);
        }
        for (int grid = 0; grid < gridCount && it.hasNext(); grid++) {
            final DataRow inRow = it.next();
            final DataCell groupValue;
            if (groupColIndex < 0) {
                groupValue = null;
            } else {
                groupValue = inRow.getCell(groupColIndex);
            }
            if (!ConvenienceMethods.areEqual(curGroupValue, groupValue)) {
                // The row belongs to the next group: hand it back and finish this output row.
                it.pushBack(inRow);
                break;
            }
            mainExec.setProgress(currentRow / (double) totalRows, "Processing row " + currentRow + "/" + totalRows + ": " + inRow.getKey());
            currentRow += 1;
            mainExec.checkCanceled();
            final int offset = grid * includeCount;
            for (int i = 0; i < includeCount; i++) {
                cells[offset + i] = inRow.getCell(includeIndices[i]);
            }
        }
        final RowKey key = RowKey.createRowKey(currentOutRow++);
        cont.addRowToTable(new DefaultRow(key, cells));
    }
    cont.close();
    return new BufferedDataTable[] { cont.getTable() };
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) BufferedDataContainer(org.knime.core.node.BufferedDataContainer) RowKey(org.knime.core.data.RowKey) SettingsModelFilterString(org.knime.core.node.defaultnodesettings.SettingsModelFilterString) DataRow(org.knime.core.data.DataRow) ExecutionContext(org.knime.core.node.ExecutionContext) ColumnRearranger(org.knime.core.data.container.ColumnRearranger) SortedTable(org.knime.base.data.sort.SortedTable) BufferedDataTable(org.knime.core.node.BufferedDataTable) DataCell(org.knime.core.data.DataCell) ExecutionMonitor(org.knime.core.node.ExecutionMonitor) DefaultRow(org.knime.core.data.def.DefaultRow)

Example 9 with SortedTable

use of org.knime.base.data.sort.SortedTable in project knime-core by knime.

the class ROCCalculator method calculateCurveData.

/**
 * Calculates one ROC curve and its area for every configured score column.
 *
 * <p>For each score column the table is sorted on that column, the rows are walked in sorted order while
 * counting rows of the positive class and all other rows, and the running counts are turned into curve
 * points. The resulting curves are stored in {@code m_outCurves}; the areas are written, one row per
 * score column, into the table stored in {@code m_outTable}. {@code m_warningMessage} is reset at the
 * start and set if the input is empty or if missing values are encountered and not ignored.
 *
 * @param table the table with the data
 * @param exec the execution context to use for reporting progress
 * @throws CanceledExecutionException when the user cancels the execution
 */
public void calculateCurveData(final BufferedDataTable table, final ExecutionContext exec) throws CanceledExecutionException {
    m_warningMessage = null;
    final List<ROCCurve> curves = new ArrayList<ROCCurve>();
    final int classIndex = table.getDataTableSpec().findColumnIndex(m_classCol);
    final int curvesSize = m_curves.size();
    final int size = table.getRowCount();
    if (size == 0) {
        m_warningMessage = "Input table contains no rows";
    }
    final BufferedDataContainer outCont = exec.createDataContainer(OUT_SPEC);
    for (int curveNo = 0; curveNo < curvesSize; curveNo++) {
        exec.checkCanceled();
        final String scoreColumn = m_curves.get(curveNo);
        final ExecutionContext subExec = exec.createSubExecutionContext(1.0 / curvesSize);
        final SortedTable sortedTable =
            new SortedTable(table, Collections.singletonList(scoreColumn), new boolean[] { false }, subExec);
        subExec.setProgress(1.0);

        int tp = 0;
        int fp = 0;
        // Plot coordinates; index 0 is never written in the loop below and stays at (0, 0).
        double[] xValues = new double[size + 1];
        double[] yValues = new double[size + 1];
        int lastPoint = 0;
        final int scoreColIndex = sortedTable.getDataTableSpec().findColumnIndex(scoreColumn);
        DataCell lastScore = null;
        for (final DataRow row : sortedTable) {
            exec.checkCanceled();
            final DataCell realClass = row.getCell(classIndex);
            final DataCell score = row.getCell(scoreColIndex);
            final boolean hasMissing = realClass.isMissing() || score.isMissing();
            if (hasMissing && m_ignoreMissingValues) {
                continue;
            }
            if (hasMissing) {
                // The row is still counted below; only the warning is recorded.
                m_warningMessage = "Table contains missing values.";
            }
            if (realClass.toString().equals(m_posClass)) {
                tp++;
            } else {
                fp++;
            }
            // A new point is only opened when the score changes. Rows sharing the score of their
            // predecessor overwrite the current point, so tied scores yield a single curve point.
            if (!score.equals(lastScore)) {
                lastPoint++;
                lastScore = score;
            }
            xValues[lastPoint] = fp;
            yValues[lastPoint] = tp;
        }
        xValues = Arrays.copyOf(xValues, lastPoint + 1);
        yValues = Arrays.copyOf(yValues, lastPoint + 1);
        // Turn the absolute counts into rates.
        // NOTE(review): if no row was counted as positive (tp == 0) or none as non-positive (fp == 0),
        // the affected axis becomes NaN for every point except the last one, which is overwritten with
        // 1 below - confirm that consumers of the curve tolerate this.
        for (int p = 0; p <= lastPoint; p++) {
            xValues[p] /= fp;
            yValues[p] /= tp;
        }
        xValues[xValues.length - 1] = 1;
        yValues[yValues.length - 1] = 1;

        // Area under the curve via the trapezoidal rule; segments without horizontal extent are skipped.
        double area = 0;
        for (int p = 1; p < xValues.length; p++) {
            final double width = xValues[p] - xValues[p - 1];
            if (xValues[p - 1] < xValues[p]) {
                final double heightSum = yValues[p] + yValues[p - 1];
                area += 0.5 * width * heightSum;
            }
        }
        curves.add(new ROCCurve(scoreColumn, xValues, yValues, area, m_maxPoints));
        outCont.addRowToTable(new DefaultRow(new RowKey(scoreColumn), new DoubleCell(area)));
    }
    m_outCurves = curves;
    outCont.close();
    m_outTable = outCont.getTable();
}
Also used : BufferedDataContainer(org.knime.core.node.BufferedDataContainer) RowKey(org.knime.core.data.RowKey) DoubleCell(org.knime.core.data.def.DoubleCell) ArrayList(java.util.ArrayList) DataRow(org.knime.core.data.DataRow) ExecutionContext(org.knime.core.node.ExecutionContext) SortedTable(org.knime.base.data.sort.SortedTable) DataCell(org.knime.core.data.DataCell) DefaultRow(org.knime.core.data.def.DefaultRow)

Example 10 with SortedTable

use of org.knime.base.data.sort.SortedTable in project knime-core by knime.

the class TargetShufflingNodeModel method execute.

/**
 * Shuffles the values of the configured column of the first input table and returns the input with
 * that column replaced by its shuffled version.
 *
 * <p>The shuffling works by reducing the input to the selected column, appending a column of random
 * long values drawn from {@code m_random}, sorting on the random column and finally feeding the sorted
 * values back, row by row, into the selected column of the original table.
 *
 * {@inheritDoc}
 */
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
    final BufferedDataTable inTable = inData[0];
    final DataTableSpec inSpec = inTable.getDataTableSpec();
    final int colIndex = inSpec.findColumnIndex(m_settings.columnName());
    final DataColumnSpec colSpec = inSpec.getColumnSpec(colIndex);
    final String colName = colSpec.getName();

    // Rearranger that drops every column but the selected one.
    final ColumnRearranger colRe = new ColumnRearranger(inSpec);
    for (final DataColumnSpec column : inSpec) {
        final String name = column.getName();
        if (name.equals(colName)) {
            continue;
        }
        colRe.remove(name);
    }

    // Append a column holding one random number per row; it serves as the sort key.
    final String uniqueColumnName = DataTableSpec.getUniqueColumnName(inSpec, "random_col");
    final SingleCellFactory randomFactory =
        new SingleCellFactory(new DataColumnSpecCreator(uniqueColumnName, LongCell.TYPE).createSpec()) {

            @Override
            public DataCell getCell(final DataRow row) {
                return new LongCell(m_random.nextLong());
            }
        };
    colRe.append(randomFactory);
    final BufferedDataTable source = exec.createBufferedDataTable(inTable, exec);
    final BufferedDataTable toSort = exec.createColumnRearrangeTable(source, colRe, exec.createSilentSubProgress(.2));

    // Sorting on the random numbers (second column) shuffles the selected column (first column).
    final List<String> include = new ArrayList<String>();
    include.add(toSort.getDataTableSpec().getColumnSpec(1).getName());
    final SortedTable sort = new SortedTable(toSort, include, new boolean[] { true }, exec.createSubExecutionContext(.6));
    final BufferedDataTable sorted = sort.getBufferedDataTable();

    // Replace the selected column of the original table with the shuffled values.
    final ColumnRearranger crea = new ColumnRearranger(inSpec);
    final SingleCellFactory shuffledFactory = new SingleCellFactory(colSpec) {

        /** Walks the shuffled table in step with the rows handed to {@link #getCell(DataRow)}. */
        private final CloseableRowIterator m_shuffledRows = sorted.iterator();

        @Override
        public DataCell getCell(final DataRow row) {
            return m_shuffledRows.next().getCell(0);
        }
    };
    crea.replace(shuffledFactory, colName);
    final BufferedDataTable shuffled = exec.createColumnRearrangeTable(inTable, crea, exec.createSubProgress(0.2));
    return new BufferedDataTable[] { shuffled };
}
Also used : DataColumnSpecCreator(org.knime.core.data.DataColumnSpecCreator) ArrayList(java.util.ArrayList) CloseableRowIterator(org.knime.core.data.container.CloseableRowIterator) DataRow(org.knime.core.data.DataRow) ColumnRearranger(org.knime.core.data.container.ColumnRearranger) DataColumnSpec(org.knime.core.data.DataColumnSpec) LongCell(org.knime.core.data.def.LongCell) SortedTable(org.knime.base.data.sort.SortedTable) BufferedDataTable(org.knime.core.node.BufferedDataTable) DataCell(org.knime.core.data.DataCell) SingleCellFactory(org.knime.core.data.container.SingleCellFactory)

Aggregations

SortedTable (org.knime.base.data.sort.SortedTable)18 DataRow (org.knime.core.data.DataRow)16 BufferedDataTable (org.knime.core.node.BufferedDataTable)13 DataTableSpec (org.knime.core.data.DataTableSpec)12 ArrayList (java.util.ArrayList)11 DataCell (org.knime.core.data.DataCell)10 ExecutionContext (org.knime.core.node.ExecutionContext)10 DefaultRow (org.knime.core.data.def.DefaultRow)9 DataColumnSpec (org.knime.core.data.DataColumnSpec)8 DoubleValue (org.knime.core.data.DoubleValue)8 RowKey (org.knime.core.data.RowKey)8 LinkedHashMap (java.util.LinkedHashMap)7 ColumnRearranger (org.knime.core.data.container.ColumnRearranger)7 DataContainer (org.knime.core.data.container.DataContainer)5 BufferedDataContainer (org.knime.core.node.BufferedDataContainer)5 SettingsModelFilterString (org.knime.core.node.defaultnodesettings.SettingsModelFilterString)5 Map (java.util.Map)4 HashMap (java.util.HashMap)3 HashSet (java.util.HashSet)3 DataColumnSpecCreator (org.knime.core.data.DataColumnSpecCreator)3