Search in sources :

Example 71 with DoubleValue

use of org.knime.core.data.DoubleValue in project knime-core by knime.

the class CorrelationComputer method calculateStatistics.

/**
 * First pass over the data. For every ordered pair (i, j) of numeric columns it accumulates the sum and the
 * sum of squares of column i over all rows in which neither column i nor column j is missing, and converts
 * these into pair-wise means and standard deviations. It also gathers the distinct values of every
 * categorical column, giving up on a column once it holds more than {@code m_maxPossibleValues} values.
 *
 * @param table the table to scan; its spec is asserted to have the same structure as {@code m_tableSpec}
 * @param exec used to report progress and to check for cancellation after each row
 * @throws CanceledExecutionException if the execution was canceled
 */
@SuppressWarnings("unchecked")
public void calculateStatistics(final BufferedDataTable table, final ExecutionContext exec) throws CanceledExecutionException {
    final DataTableSpec inputSpec = table.getDataTableSpec();
    assert inputSpec.equalStructure(m_tableSpec);
    final int categoricalColCount = m_categoricalColIndexMap.length;
    m_possibleValues = new LinkedHashMap[categoricalColCount];
    for (int c = 0; c < categoricalColCount; c++) {
        m_possibleValues[c] = new LinkedHashMap<DataCell, Integer>();
    }
    final int numericColCount = m_numericColIndexMap.length;
    final double[][] sums = new double[numericColCount][numericColCount];
    final double[][] squareSums = new double[numericColCount][numericColCount];
    final HalfIntMatrix pairCounts = new HalfIntMatrix(numericColCount, true);
    final DataCell[] rowCells = new DataCell[m_tableSpec.getNumColumns()];
    final long totalRows = table.size();
    long processedRows = 0;
    for (DataRow row : table) {
        // each cell is looked up multiple times below, so the row is copied into a buffer first
        for (int col = 0; col < rowCells.length; col++) {
            rowCells[col] = row.getCell(col);
        }
        for (int i = 0; i < numericColCount; i++) {
            final DataCell cellI = rowCells[m_numericColIndexMap[i]];
            if (cellI.isMissing()) {
                // remember that this numeric column contains missing values
                m_numericsWithMissings.add(m_numericColIndexMap[i]);
                continue;
            }
            final double value = ((DoubleValue) cellI).getDoubleValue();
            final double squared = value * value;
            for (int j = 0; j < numericColCount; j++) {
                if (rowCells[m_numericColIndexMap[j]].isMissing()) {
                    continue;
                }
                sums[i][j] += value;
                squareSums[i][j] += squared;
                // only update the count for i <= j so that a pair is not counted twice
                if (i <= j) {
                    pairCounts.add(i, j, 1);
                }
            }
        }
        for (int c = 0; c < categoricalColCount; c++) {
            final DataCell cell = row.getCell(m_categoricalColIndexMap[c]);
            final LinkedHashMap<DataCell, Integer> seenValues = m_possibleValues[c];
            if (seenValues == null) {
                // this column already exceeded the limit of distinct values
                continue;
            }
            // note: a missing cell is recorded as a possible value, too
            seenValues.put(cell, null);
            if (seenValues.size() > m_maxPossibleValues) {
                m_possibleValues[c] = null;
            }
        }
        exec.checkCanceled();
        exec.setProgress(processedRows / (double) totalRows, String.format("Calculating statistics - %d/%d (\"%s\")", processedRows, totalRows, row.getKey()));
        processedRows++;
    }
    // number the distinct values of each categorical column in insertion order
    for (LinkedHashMap<DataCell, Integer> valueMap : m_possibleValues) {
        if (valueMap == null) {
            continue;
        }
        int nextIndex = 0;
        for (Map.Entry<DataCell, Integer> valueEntry : valueMap.entrySet()) {
            valueEntry.setValue(nextIndex);
            nextIndex++;
        }
    }
    // convert in place: sums --> means, squareSums --> standard deviations
    for (int i = 0; i < numericColCount; i++) {
        for (int j = 0; j < numericColCount; j++) {
            final int pairCount = pairCounts.get(i, j);
            final double sum = sums[i][j];
            double stdDev = 0.0;
            if (pairCount > 1) {
                double variance = (squareSums[i][j] - (sum * sum) / pairCount) / (pairCount - 1);
                if (variance < PMCCPortObjectAndSpec.ROUND_ERROR_OK) {
                    // treat values below the rounding tolerance as zero variance
                    variance = 0.0;
                }
                stdDev = Math.sqrt(variance);
            }
            squareSums[i][j] = stdDev;
            if (pairCount > 0) {
                sums[i][j] = sum / pairCount;
            } else {
                sums[i][j] = Double.NaN;
            }
        }
    }
    m_numericMeanMatrix = sums;
    m_numericStdDevMatrix = squareSums;
    m_numericValidCountMatrix = pairCounts;
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) DataRow(org.knime.core.data.DataRow) HalfIntMatrix(org.knime.base.util.HalfIntMatrix) DoubleValue(org.knime.core.data.DoubleValue) DataCell(org.knime.core.data.DataCell) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 72 with DoubleValue

use of org.knime.core.data.DoubleValue in project knime-core by knime.

the class CorrelationComputer method calculateOutput.

/**
 * Second scan on data. Computes the pair wise correlation for numeric
 * columns and reads the contingency tables of pairs of categorical
 * columns into memory.
 *
 * <p>Relies on the mean, standard deviation, valid count and possible value fields that
 * {@code calculateStatistics} assigns, so that method has to be run first.</p>
 *
 * @param table the table to scan; its spec is asserted to have the same structure as {@code m_tableSpec}
 * @param exec used to report progress and to check for cancellation after each row
 * @return the output matrix to be turned into the output model: a half matrix (without diagonal) over all
 *         columns of {@code m_tableSpec} that holds the correlation for pairs of numeric columns, the result
 *         of {@code computeCramersV} for pairs of categorical columns and {@link Double#NaN} where no value
 *         could be computed
 * @throws CanceledExecutionException if the execution was canceled
 */
public HalfDoubleMatrix calculateOutput(final BufferedDataTable table, final ExecutionMonitor exec) throws CanceledExecutionException {
    assert table.getDataTableSpec().equalStructure(m_tableSpec);
    int catCount = m_categoricalColIndexMap.length;
    int categoricalPairsCount = (catCount - 1) * catCount / 2;
    // stores all pair-wise contingency tables,
    // contingencyTables[i] == null <--> either column of the corresponding
    // pair has more than m_maxPossibleValues values
    // http://en.wikipedia.org/wiki/Contingency_table
    int[][][] contingencyTables = new int[categoricalPairsCount][][];
    int valIndex = 0;
    for (int i = 0; i < m_categoricalColIndexMap.length; i++) {
        for (int j = i + 1; j < m_categoricalColIndexMap.length; j++) {
            LinkedHashMap<DataCell, Integer> valuesI = m_possibleValues[i];
            LinkedHashMap<DataCell, Integer> valuesJ = m_possibleValues[j];
            if (valuesI != null && valuesJ != null) {
                int iSize = valuesI.size();
                int jSize = valuesJ.size();
                contingencyTables[valIndex] = new int[iSize][jSize];
            }
            valIndex++;
        }
    }
    final int numColumns = m_tableSpec.getNumColumns();
    HalfDoubleMatrix nominatorMatrix = new HalfDoubleMatrix(numColumns, /*includeDiagonal=*/ false);
    nominatorMatrix.fill(Double.NaN);
    long rowIndex = 0;
    DataCell[] cells = new DataCell[numColumns];
    final long rowCount = table.size();
    // initialize the numeric pairs: 0.0 where a correlation can be accumulated, NaN where one of the
    // columns is constant (over all of its rows or over the rows where the partner is not missing)
    for (int i = 0; i < m_numericColIndexMap.length; i++) {
        final double stdDevI = m_numericStdDevMatrix[i][i];
        if (stdDevI == 0.0) {
            for (int j = i + 1; j < m_numericColIndexMap.length; j++) {
                nominatorMatrix.set(m_numericColIndexMap[i], m_numericColIndexMap[j], Double.NaN);
            }
            m_numericsWithConstantValues.add(new Pair<Integer, Integer>(m_numericColIndexMap[i], null));
        } else {
            for (int j = i + 1; j < m_numericColIndexMap.length; j++) {
                nominatorMatrix.set(m_numericColIndexMap[i], m_numericColIndexMap[j], 0.0);
                final double stdDevJ = m_numericStdDevMatrix[j][j];
                if (stdDevJ == 0.0) {
                    nominatorMatrix.set(m_numericColIndexMap[i], m_numericColIndexMap[j], Double.NaN);
                    // rest is fixed when j becomes the current value
                    // in the outer loop
                } else {
                    double stdDevIUnderJ = m_numericStdDevMatrix[i][j];
                    double stdDevJUnderI = m_numericStdDevMatrix[j][i];
                    if (stdDevIUnderJ == 0.0) {
                        // all values in column i where j is not missing
                        // are constant
                        m_numericsWithConstantValues.add(new Pair<Integer, Integer>(m_numericColIndexMap[i], m_numericColIndexMap[j]));
                        nominatorMatrix.set(m_numericColIndexMap[i], m_numericColIndexMap[j], Double.NaN);
                    }
                    if (stdDevJUnderI == 0.0) {
                        // all values in column j where i is not missing
                        // are constant
                        m_numericsWithConstantValues.add(new Pair<Integer, Integer>(m_numericColIndexMap[j], m_numericColIndexMap[i]));
                        nominatorMatrix.set(m_numericColIndexMap[i], m_numericColIndexMap[j], Double.NaN);
                    }
                }
            }
        }
    }
    for (DataRow r : table) {
        // buffer the row, cells are accessed multiple times
        for (int i = 0; i < cells.length; i++) {
            cells[i] = r.getCell(i);
        }
        // numeric pairs: add up the products of the standardized values
        for (int i = 0; i < m_numericColIndexMap.length; i++) {
            final DataCell ci = cells[m_numericColIndexMap[i]];
            if (ci.isMissing()) {
                continue;
            }
            if (m_numericStdDevMatrix[i][i] == 0.0) {
                // constant column, reported above
                continue;
            }
            final double di = ((DoubleValue) ci).getDoubleValue();
            for (int j = i + 1; j < m_numericColIndexMap.length; j++) {
                final DataCell cj = cells[m_numericColIndexMap[j]];
                if (cj.isMissing()) {
                    continue;
                }
                // statistics of i restricted to rows where j is present, and vice versa
                final double meanI = m_numericMeanMatrix[i][j];
                final double stdDevI = m_numericStdDevMatrix[i][j];
                final double meanJ = m_numericMeanMatrix[j][i];
                final double stdDevJ = m_numericStdDevMatrix[j][i];
                if (stdDevI == 0.0 || stdDevJ == 0.0) {
                    // reported above
                    continue;
                }
                final double vi = (di - meanI) / stdDevI;
                final double dj = ((DoubleValue) cj).getDoubleValue();
                final double vj = (dj - meanJ) / stdDevJ;
                nominatorMatrix.add(m_numericColIndexMap[i], m_numericColIndexMap[j], vi * vj);
            }
        }
        // categorical pairs: count the value combination of this row; valIndex runs over all pairs in the
        // same order that was used to allocate the contingency tables
        valIndex = 0;
        for (int i = 0; i < m_categoricalColIndexMap.length; i++) {
            for (int j = i + 1; j < m_categoricalColIndexMap.length; j++, valIndex++) {
                LinkedHashMap<DataCell, Integer> possibleValuesI = m_possibleValues[i];
                LinkedHashMap<DataCell, Integer> possibleValuesJ = m_possibleValues[j];
                if (possibleValuesI == null || possibleValuesJ == null) {
                    continue;
                }
                DataCell ci = r.getCell(m_categoricalColIndexMap[i]);
                DataCell cj = r.getCell(m_categoricalColIndexMap[j]);
                Integer indexI = possibleValuesI.get(ci);
                Integer indexJ = possibleValuesJ.get(cj);
                assert indexI != null && indexI >= 0 : String.format("Value unknown in value list of column \"%s-\": %s", table.getDataTableSpec().getColumnSpec(m_categoricalColIndexMap[i]).getName(), ci);
                // report the offending cell of column j (cj), not the cell of column i
                assert indexJ != null && indexJ >= 0 : String.format("Value unknown in value list of column \"%s-\": %s", table.getDataTableSpec().getColumnSpec(m_categoricalColIndexMap[j]).getName(), cj);
                contingencyTables[valIndex][indexI][indexJ]++;
            }
        }
        exec.checkCanceled();
        exec.setProgress(rowIndex / (double) rowCount, String.format("Calculating statistics - %d/%d (\"%s\")", rowIndex, rowCount, r.getKey()));
        rowIndex += 1;
    }
    // normalize the accumulated sums by (valid count - 1); NaN entries stay untouched
    for (int i = 0; i < m_numericColIndexMap.length; i++) {
        for (int j = i + 1; j < m_numericColIndexMap.length; j++) {
            final int trueI = m_numericColIndexMap[i];
            final int trueJ = m_numericColIndexMap[j];
            double t = nominatorMatrix.get(trueI, trueJ);
            if (!Double.isNaN(t)) {
                int validCount = m_numericValidCountMatrix.get(i, j);
                nominatorMatrix.set(trueI, trueJ, t / (validCount - 1));
            }
        }
    }
    // categorical pairs: NaN if no contingency table was kept, the computeCramersV result otherwise
    valIndex = 0;
    for (int i = 0; i < m_categoricalColIndexMap.length; i++) {
        for (int j = i + 1; j < m_categoricalColIndexMap.length; j++) {
            int[][] contingencyTable = contingencyTables[valIndex];
            double value;
            if (contingencyTable == null) {
                value = Double.NaN;
            } else {
                value = computeCramersV(contingencyTable);
            }
            nominatorMatrix.set(m_categoricalColIndexMap[i], m_categoricalColIndexMap[j], value);
            valIndex++;
        }
    }
    return nominatorMatrix;
}
Also used : DataRow(org.knime.core.data.DataRow) DoubleValue(org.knime.core.data.DoubleValue) HalfDoubleMatrix(org.knime.base.util.HalfDoubleMatrix) DataCell(org.knime.core.data.DataCell)

Example 73 with DoubleValue

use of org.knime.core.data.DoubleValue in project knime-core by knime.

the class Numeric2BitVectorMeanCellFactory method getCell.

/**
 * {@inheritDoc}
 *
 * Builds a bit vector with one bit per configured column: bit {@code i} is set if the value of column
 * {@code i} is at least {@code m_meanFactor} times the stored mean of that column. Missing cells leave the
 * bit unset. A non-missing cell that is not a {@link DoubleValue} is logged as an error and makes the whole
 * result a missing cell.
 */
@Override
public DataCell getCell(final DataRow row) {
    incrementNrOfRows();
    final int bitCount = m_columns.length;
    org.knime.core.data.vector.bitvector.BitVectorCellFactory<? extends DataCell> bitFactory = m_vectorType.getCellFactory(bitCount);
    for (int pos = 0; pos < bitCount; pos++) {
        final DataCell current = row.getCell(m_columns[pos]);
        if (current.isMissing()) {
            // a missing value is counted as an unset bit
            m_totalNrOf0s++;
            continue;
        }
        if (!(current instanceof DoubleValue)) {
            printError(LOGGER, row, "Incompatible type found.");
            return DataType.getMissingCell();
        }
        final double value = ((DoubleValue) current).getDoubleValue();
        final boolean reachesThreshold = value >= (m_meanFactor * m_meanValues[pos]);
        if (reachesThreshold) {
            bitFactory.set(pos);
            m_totalNrOf1s++;
        } else {
            m_totalNrOf0s++;
        }
    }
    return bitFactory.createDataCell();
}
Also used : DoubleValue(org.knime.core.data.DoubleValue) DataCell(org.knime.core.data.DataCell)

Example 74 with DoubleValue

use of org.knime.core.data.DoubleValue in project knime-core by knime.

the class MedianTable method medianValues.

/**
 * Returns the median of each configured column. The work is only done on the first call (while
 * {@code m_medians} is still {@code null}): the table is scanned once to count the values that take part in
 * the median of each column, the lower and upper median positions are derived from these counts and handed
 * to {@code sortOnDisk}. Every call returns a copy of the stored array.
 *
 * @param context An {@link ExecutionContext}
 * @return The median values for the columns in the order of the columns specified in the constructor. The values
 *         can be {@link Double#NaN}s in certain circumstances.
 * @throws CanceledExecutionException When cancelled.
 * @throws IllegalStateException if a non-missing cell of a selected column is not a {@link DoubleValue}
 */
public synchronized double[] medianValues(final ExecutionContext context) throws CanceledExecutionException {
    if (m_medians == null) {
        m_medians = new double[m_indices.length];
        // counted as long: the positions derived below are long, and an int counter would wrap around
        // for tables with more than Integer.MAX_VALUE rows
        long[] validCount = new long[m_indices.length];
        for (DataRow row : m_table) {
            context.checkCanceled();
            for (int i = 0; i < m_indices.length; ++i) {
                int col = m_indices[i];
                final DataCell cell = row.getCell(col);
                if (cell.isMissing()) {
                    // missing cells only count when they are configured to be included
                    if (m_includeMissingValues) {
                        validCount[i]++;
                    }
                } else if (cell instanceof DoubleValue) {
                    DoubleValue dv = (DoubleValue) cell;
                    // NaNs only count when they are configured to be included
                    if (m_includeNaNs || !Double.isNaN(dv.getDoubleValue())) {
                        validCount[i]++;
                    }
                } else {
                    throw new IllegalStateException("Not a double value: " + cell + " in column: " + m_table.getSpec().getColumnSpec(col).getName());
                }
            }
        }
        // two indices per column that denote the lower and upper index of the median value (or both the same)
        long[][] k = new long[2][m_indices.length];
        for (int i = 0; i < 2; i++) {
            for (int j = 0; j < m_indices.length; j++) {
                k[i][j] = validCount[j] > 0 ? (validCount[j] - 1 + i) / 2 : 0;
            }
        }
        // presumably fills m_medians from the values at the positions in k - confirm in sortOnDisk
        sortOnDisk(context, k);
    }
    return m_medians.clone();
}
Also used : DoubleValue(org.knime.core.data.DoubleValue) ArrayList(java.util.ArrayList) DataCell(org.knime.core.data.DataCell) DataRow(org.knime.core.data.DataRow)

Example 75 with DoubleValue

use of org.knime.core.data.DoubleValue in project knime-core by knime.

the class DoubleMinMax method consumeRow.

/**
 * {@inheritDoc}
 *
 * Updates the running minimum and maximum of every tracked column with the values of the given row.
 * Missing cells are skipped. Infinite values are not taken over when {@code m_ignoreInfiniteValues} is set.
 * A stored {@link Double#NaN} is treated as "no value seen yet".
 */
@Override
protected void consumeRow(final DataRow dataRow) {
    int slot = 0;
    for (int columnIndex : getIndices()) {
        final DataCell cell = dataRow.getCell(columnIndex);
        if (!cell.isMissing()) {
            final double value = ((DoubleValue) cell).getDoubleValue();
            // the same infinity filter applies to both bounds
            final boolean accepted = !(m_ignoreInfiniteValues && Double.isInfinite(value));
            final boolean newMin = Double.isNaN(m_min[slot]) || value < m_min[slot];
            if (newMin && accepted) {
                m_min[slot] = value;
            }
            final boolean newMax = Double.isNaN(m_max[slot]) || value > m_max[slot];
            if (newMax && accepted) {
                m_max[slot] = value;
            }
        }
        slot++;
    }
}
Also used : DoubleValue(org.knime.core.data.DoubleValue) DataCell(org.knime.core.data.DataCell)

Aggregations

DoubleValue (org.knime.core.data.DoubleValue)154 DataCell (org.knime.core.data.DataCell)103 DataRow (org.knime.core.data.DataRow)71 DataColumnSpec (org.knime.core.data.DataColumnSpec)38 DataTableSpec (org.knime.core.data.DataTableSpec)38 DoubleCell (org.knime.core.data.def.DoubleCell)32 ArrayList (java.util.ArrayList)26 BufferedDataTable (org.knime.core.node.BufferedDataTable)26 DataType (org.knime.core.data.DataType)23 InvalidSettingsException (org.knime.core.node.InvalidSettingsException)21 LinkedHashMap (java.util.LinkedHashMap)18 IntValue (org.knime.core.data.IntValue)15 HashMap (java.util.HashMap)14 RowIterator (org.knime.core.data.RowIterator)14 RowKey (org.knime.core.data.RowKey)13 DefaultRow (org.knime.core.data.def.DefaultRow)13 DataColumnSpecCreator (org.knime.core.data.DataColumnSpecCreator)12 LongValue (org.knime.core.data.LongValue)10 StringValue (org.knime.core.data.StringValue)10 DateAndTimeValue (org.knime.core.data.date.DateAndTimeValue)10