Search in sources :

Example 6 with StatisticsTable

use of org.knime.base.data.statistics.StatisticsTable in project knime-core by knime.

the class Normalizer method doMinMaxNorm.

/**
 * Does the Min-Max Normalization.
 *
 * <p>For every included column that is double-compatible and has a
 * non-missing maximum, a scale and a translation are computed from the
 * column's observed minimum and maximum: first the pair that maps
 * [min, max] onto [0, 1], which is then stretched and shifted to
 * [newmin, newmax]. A column whose minimum equals its maximum gets a
 * base scale of 1 instead of a division by zero. All other included
 * columns get {@link Double#NaN} for scale, translation, min and max.
 *
 * @param newmax the new maximum
 * @param newmin the new minimum
 * @param exec an object to check for user cancellations. Can be
 *            <code>null</code>, in which case a default monitor is used.
 * @throws CanceledExecutionException if user canceled
 * @return normalized DataTable
 */
public AffineTransTable doMinMaxNorm(final double newmax, final double newmin, final ExecutionMonitor exec) throws CanceledExecutionException {
    // The contract above allows a null monitor; substitute a default one so
    // that creating the sub-progress does not fail with a NullPointerException.
    final ExecutionMonitor monitor = (exec == null ? new ExecutionMonitor() : exec);
    ExecutionMonitor statisticsExec = monitor.createSubProgress(.5);
    StatisticsTable st;
    if (m_table instanceof StatisticsTable) {
        // statistics are already available, no need to compute them again
        st = (StatisticsTable) m_table;
    } else {
        st = new StatisticsTable(m_table, statisticsExec);
    }
    checkForMissVals(st);
    DataTableSpec spec = st.getDataTableSpec();
    DataCell[] max = st.getMax();
    DataCell[] min = st.getMin();
    final double[] scales = new double[m_colindices.length];
    final double[] transforms = new double[m_colindices.length];
    final double[] mins = new double[m_colindices.length];
    final double[] maxs = new double[m_colindices.length];
    final double targetRange = newmax - newmin;
    for (int i = 0; i < transforms.length; i++) {
        // index of the i-th included column within the underlying table
        final int col = m_colindices[i];
        DataColumnSpec cSpec = spec.getColumnSpec(col);
        boolean isDouble = cSpec.getType().isCompatible(DoubleValue.class);
        if (!isDouble || max[col].isMissing()) {
            // a missing maximum is expected to come with a missing minimum
            assert (!isDouble || min[col].isMissing());
            scales[i] = Double.NaN;
            transforms[i] = Double.NaN;
            mins[i] = Double.NaN;
            maxs[i] = Double.NaN;
        } else {
            // scales and translation to [0,1]
            double maxI = ((DoubleValue) max[col]).getDoubleValue();
            double minI = ((DoubleValue) min[col]).getDoubleValue();
            // constant column: avoid division by zero, keep scale at 1
            scales[i] = (maxI == minI ? 1 : 1.0 / (maxI - minI));
            transforms[i] = -minI * scales[i];
            // scale and translation to [newmin, newmax]
            scales[i] *= targetRange;
            transforms[i] *= targetRange;
            transforms[i] += newmin;
            mins[i] = newmin;
            maxs[i] = newmax;
        }
    }
    String[] includes = getNames();
    String minS = DoubleFormat.formatDouble(newmin);
    String maxS = DoubleFormat.formatDouble(newmax);
    String summary = "Min/Max (" + minS + ", " + maxS + ") normalization " + "on " + includes.length + " column(s)";
    AffineTransConfiguration configuration = new AffineTransConfiguration(includes, scales, transforms, mins, maxs, summary);
    return new AffineTransTable(m_table, configuration);
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) StatisticsTable(org.knime.base.data.statistics.StatisticsTable) DataColumnSpec(org.knime.core.data.DataColumnSpec) DoubleValue(org.knime.core.data.DoubleValue) DataCell(org.knime.core.data.DataCell) ExecutionMonitor(org.knime.core.node.ExecutionMonitor)

Example 7 with StatisticsTable

use of org.knime.base.data.statistics.StatisticsTable in project knime-core by knime.

the class Normalizer method doZScoreNorm.

/**
 * Does the Z-Score Normalization.
 *
 * <p>For every included column a scale of {@code 1 / stddev} and a
 * translation of {@code -mean * scale} are computed from the table's
 * statistics. A standard deviation of exactly zero yields a scale of 1
 * instead of a division by zero. Columns whose mean is
 * {@link Double#NaN} get NaN for both scale and translation. The min and
 * max bounds handed to the configuration are always NaN.
 *
 * @param exec an object to check for user cancellations. Can be
 *            <code>null</code>, in which case a default monitor is used.
 * @throws CanceledExecutionException if user canceled
 * @return the normalized DataTable
 */
public AffineTransTable doZScoreNorm(final ExecutionMonitor exec) throws CanceledExecutionException {
    // The contract above allows a null monitor; substitute a default one so
    // that creating the sub-progress does not fail with a NullPointerException.
    final ExecutionMonitor monitor = (exec == null ? new ExecutionMonitor() : exec);
    ExecutionMonitor statisticsExec = monitor.createSubProgress(.5);
    StatisticsTable st;
    if (m_table instanceof StatisticsTable) {
        // statistics are already available, no need to compute them again
        st = (StatisticsTable) m_table;
    } else {
        st = new StatisticsTable(m_table, statisticsExec);
    }
    checkForMissVals(st);
    double[] mean = st.getMean();
    double[] stddev = st.getStandardDeviation();
    final double[] scales = new double[m_colindices.length];
    final double[] transforms = new double[m_colindices.length];
    final double[] mins = new double[m_colindices.length];
    final double[] maxs = new double[m_colindices.length];
    for (int i = 0; i < m_colindices.length; i++) {
        // index of the i-th included column within the underlying table
        final int col = m_colindices[i];
        if (Double.isNaN(mean[col])) {
            scales[i] = Double.NaN;
            transforms[i] = Double.NaN;
        } else {
            // zero deviation: avoid division by zero, keep scale at 1
            scales[i] = (stddev[col] == 0.0 ? 1.0 : 1.0 / stddev[col]);
            transforms[i] = -mean[col] * scales[i];
        }
        // no target bounds are set for a z-score normalization
        mins[i] = Double.NaN;
        maxs[i] = Double.NaN;
    }
    String[] includes = getNames();
    String summary = "Z-Score (Gaussian) normalization on " + includes.length + " column(s)";
    AffineTransConfiguration configuration = new AffineTransConfiguration(includes, scales, transforms, mins, maxs, summary);
    return new AffineTransTable(m_table, configuration);
}
Also used : StatisticsTable(org.knime.base.data.statistics.StatisticsTable) ExecutionMonitor(org.knime.core.node.ExecutionMonitor)

Example 8 with StatisticsTable

use of org.knime.base.data.statistics.StatisticsTable in project knime-core by knime.

the class PMCCNodeModel method execute.

/**
 * {@inheritDoc}
 *
 * <p>Computes a pair-wise measure for the included columns of the first
 * input table and stores it in a half matrix (no diagonal):
 * <ul>
 * <li>pairs of double-compatible columns: the sum over all rows of the
 * product of the z-score normalized values, scaled by
 * {@code 1 / (rowCount - 1)}; NaN if one of the columns was detected as
 * (nearly) constant,</li>
 * <li>pairs of nominal columns whose distinct values were collected (at most
 * the configured upper bound per column): the result of
 * {@code computeCramersV} on the pair's contingency table,</li>
 * <li>all other pairs: NaN.</li>
 * </ul>
 * Returns the correlation table created from that matrix together with the
 * model object.
 */
@Override
protected PortObject[] execute(final PortObject[] inData, final ExecutionContext exec) throws Exception {
    final BufferedDataTable in = (BufferedDataTable) inData[0];
    // floating point operation
    final double rC = in.getRowCount();
    int[] includes = getIncludes(in.getDataTableSpec());
    String[] includeNames = m_columnIncludesList.getIncludeList().toArray(new String[0]);
    // shares of the overall progress assigned to the three phases
    double progNormalize = 0.3;
    double progDetermine = 0.65;
    double progFinish = 1.0 - progNormalize - progDetermine;
    exec.setMessage("Normalizing data");
    final ExecutionMonitor normProg = exec.createSubProgress(progNormalize);
    FilterColumnTable filterTable = new FilterColumnTable(in, includes);
    final int l = includes.length;
    // number of unordered column pairs (i < j)
    int nomCount = (l - 1) * l / 2;
    final HalfDoubleMatrix nominatorMatrix = new HalfDoubleMatrix(includes.length, /*withDiagonal*/
    false);
    nominatorMatrix.fill(Double.NaN);
    // per column: distinct value -> index in the contingency table;
    // null for columns that are not treated as nominal
    @SuppressWarnings("unchecked") final LinkedHashMap<DataCell, Integer>[] possibleValues = new LinkedHashMap[l];
    DataTableSpec filterTableSpec = filterTable.getDataTableSpec();
    for (int i = 0; i < l; i++) {
        DataColumnSpec cs = filterTableSpec.getColumnSpec(i);
        if (cs.getType().isCompatible(NominalValue.class)) {
            possibleValues[i] = new LinkedHashMap<DataCell, Integer>();
        }
    }
    final int possValueUpperBound = m_maxPossValueCountModel.getIntValue();
    // determines possible values. We can't use those from the domain
    // as the domain can also contain values not present in the data
    // but in the contingency table we need rows/columns to have at least
    // one cell with a value >= 1
    StatisticsTable statTable = new StatisticsTable(filterTable) {

        // that is sort of the constructor in this derived class
        {
            calculateAllMoments(in.getRowCount(), normProg);
        }

        // collects the distinct values of the nominal columns for each row
        @Override
        protected void calculateMomentInSubClass(final DataRow row) {
            for (int i = 0; i < l; i++) {
                if (possibleValues[i] != null) {
                    DataCell c = row.getCell(i);
                    // note: also take missing value as possible value
                    // (the index is assigned later, null is a placeholder)
                    possibleValues[i].put(c, null);
                    // too many distinct values: stop treating the column
                    // as nominal
                    if (possibleValues[i].size() > possValueUpperBound) {
                        possibleValues[i] = null;
                    }
                }
            }
        }
    };
    // replace the placeholder values by consecutive indices in
    // insertion order
    for (LinkedHashMap<DataCell, Integer> map : possibleValues) {
        if (map != null) {
            int index = 0;
            for (Map.Entry<DataCell, Integer> entry : map.entrySet()) {
                entry.setValue(index++);
            }
        }
    }
    // stores all pair-wise contingency tables,
    // contingencyTables[i] == null <--> either column of the corresponding
    // pair is non-categorical.
    // What is a contingency table?
    // http://en.wikipedia.org/wiki/Contingency_table
    int[][][] contingencyTables = new int[nomCount][][];
    // column which only contain one value - no correlation available
    LinkedHashSet<String> constantColumns = new LinkedHashSet<String>();
    // running index of the pair (i, j); the same enumeration order is
    // used by all pair loops below
    int valIndex = 0;
    for (int i = 0; i < l; i++) {
        for (int j = i + 1; j < l; j++) {
            if (possibleValues[i] != null && possibleValues[j] != null) {
                int iSize = possibleValues[i].size();
                int jSize = possibleValues[j].size();
                contingencyTables[valIndex] = new int[iSize][jSize];
            }
            DataColumnSpec colSpecI = filterTableSpec.getColumnSpec(i);
            DataColumnSpec colSpecJ = filterTableSpec.getColumnSpec(j);
            DataType ti = colSpecI.getType();
            DataType tj = colSpecJ.getType();
            if (ti.isCompatible(DoubleValue.class) && tj.isCompatible(DoubleValue.class)) {
                // one of the two columns contains only one value
                // (only the first such column of the pair is recorded here)
                if (statTable.getVariance(i) < PMCCPortObjectAndSpec.ROUND_ERROR_OK) {
                    constantColumns.add(colSpecI.getName());
                    nominatorMatrix.set(i, j, Double.NaN);
                } else if (statTable.getVariance(j) < PMCCPortObjectAndSpec.ROUND_ERROR_OK) {
                    constantColumns.add(colSpecJ.getName());
                    nominatorMatrix.set(i, j, Double.NaN);
                } else {
                    // start value of the sum accumulated in the row loop
                    nominatorMatrix.set(i, j, 0.0);
                }
            }
            valIndex++;
        }
    }
    // constant columns have no correlation to other columns: their
    // matrix entries stay NaN (presumably shown as missing values in the
    // output table -- confirm in PMCCPortObjectAndSpec)
    if (!constantColumns.isEmpty()) {
        String[] constantColumnNames = constantColumns.toArray(new String[constantColumns.size()]);
        NodeLogger.getLogger(getClass()).info("The following numeric " + "columns contain only one distinct value or have " + "otherwise a low standard deviation: " + Arrays.toString(constantColumnNames));
        // the warning lists at most maxLength entries, the last one being
        // "..." when the list had to be cut
        int maxLength = 4;
        if (constantColumns.size() > maxLength) {
            constantColumnNames = Arrays.copyOf(constantColumnNames, maxLength);
            constantColumnNames[maxLength - 1] = "...";
        }
        setWarningMessage("Some columns contain only one distinct value: " + Arrays.toString(constantColumnNames));
    }
    // z-score normalize the data unless the table is empty
    DataTable att;
    if (statTable.getNrRows() > 0) {
        att = new Normalizer(statTable, includeNames).doZScoreNorm(// no iteration needed
        exec.createSubProgress(0.0));
    } else {
        att = statTable;
    }
    normProg.setProgress(1.0);
    exec.setMessage("Calculating correlation measure");
    ExecutionMonitor detProg = exec.createSubProgress(progDetermine);
    int rowIndex = 0;
    // per row: numeric value of each column (NaN if not used as numeric)
    double[] buf = new double[l];
    // per row: cell of each nominal column (null if not used as nominal)
    DataCell[] catBuf = new DataCell[l];
    boolean containsMissing = false;
    for (DataRow r : att) {
        detProg.checkCanceled();
        for (int i = 0; i < l; i++) {
            catBuf[i] = null;
            buf[i] = Double.NaN;
            DataCell c = r.getCell(i);
            // missing value is also a possible value here
            if (possibleValues[i] != null) {
                catBuf[i] = c;
            } else if (c.isMissing()) {
                containsMissing = true;
            } else if (filterTableSpec.getColumnSpec(i).getType().isCompatible(DoubleValue.class)) {
                buf[i] = ((DoubleValue) c).getDoubleValue();
            }
        }
        valIndex = 0;
        for (int i = 0; i < l; i++) {
            for (int j = i + 1; j < l; j++) {
                double b1 = buf[i];
                double b2 = buf[j];
                if (!Double.isNaN(b1) && !Double.isNaN(b2)) {
                    // numeric pair: accumulate the product of both values
                    // (a NaN entry of a constant column stays NaN)
                    double old = nominatorMatrix.get(i, j);
                    nominatorMatrix.set(i, j, old + b1 * b2);
                } else if (catBuf[i] != null && catBuf[j] != null) {
                    // nominal pair: count the co-occurrence of both values
                    int iIndex = possibleValues[i].get(catBuf[i]);
                    assert iIndex >= 0 : "Value unknown in value list " + "of column " + includeNames[i] + ": " + catBuf[i];
                    int jIndex = possibleValues[j].get(catBuf[j]);
                    assert jIndex >= 0 : "Value unknown in value list " + "of column " + includeNames[j] + ": " + catBuf[j];
                    contingencyTables[valIndex][iIndex][jIndex]++;
                }
                valIndex++;
            }
        }
        rowIndex++;
        detProg.setProgress(rowIndex / rC, "Processing row " + rowIndex + " (\"" + r.getKey() + "\")");
    }
    // NOTE(review): if setWarningMessage replaces rather than appends, this
    // overrides the constant-column warning set above -- confirm
    if (containsMissing) {
        setWarningMessage("Some row(s) contained missing values.");
    }
    detProg.setProgress(1.0);
    // NOTE(review): for a single-row input (rC == 1) this is a division by
    // zero in double arithmetic -- confirm that case is acceptable
    double normalizer = 1.0 / (rC - 1.0);
    valIndex = 0;
    for (int i = 0; i < l; i++) {
        for (int j = i + 1; j < l; j++) {
            if (contingencyTables[valIndex] != null) {
                // nominal pair: replace the entry by Cramer's V
                nominatorMatrix.set(i, j, computeCramersV(contingencyTables[valIndex]));
            } else if (!Double.isNaN(nominatorMatrix.get(i, j))) {
                // numeric pair: scale the accumulated sum of products
                double old = nominatorMatrix.get(i, j);
                nominatorMatrix.set(i, j, old * normalizer);
            }
            // else pair of columns is double - string (for instance)
            valIndex++;
        }
    }
    // NOTE(review): this sets the normalization monitor (already at 1.0) to
    // progDetermine; possibly meant for a different monitor -- confirm
    normProg.setProgress(progDetermine);
    PMCCPortObjectAndSpec pmccModel = new PMCCPortObjectAndSpec(includeNames, nominatorMatrix);
    ExecutionContext subExec = exec.createSubExecutionContext(progFinish);
    BufferedDataTable out = pmccModel.createCorrelationMatrix(subExec);
    m_correlationTable = out;
    return new PortObject[] { out, pmccModel };
}
Also used : LinkedHashSet(java.util.LinkedHashSet) DataTable(org.knime.core.data.DataTable) BufferedDataTable(org.knime.core.node.BufferedDataTable) DataTableSpec(org.knime.core.data.DataTableSpec) FilterColumnTable(org.knime.base.data.filter.column.FilterColumnTable) StatisticsTable(org.knime.base.data.statistics.StatisticsTable) SettingsModelFilterString(org.knime.core.node.defaultnodesettings.SettingsModelFilterString) DataRow(org.knime.core.data.DataRow) LinkedHashMap(java.util.LinkedHashMap) DataColumnSpec(org.knime.core.data.DataColumnSpec) BufferedDataTable(org.knime.core.node.BufferedDataTable) DataType(org.knime.core.data.DataType) ExecutionMonitor(org.knime.core.node.ExecutionMonitor) PortObject(org.knime.core.node.port.PortObject) Normalizer(org.knime.base.data.normalize.Normalizer) ExecutionContext(org.knime.core.node.ExecutionContext) DoubleValue(org.knime.core.data.DoubleValue) HalfDoubleMatrix(org.knime.base.util.HalfDoubleMatrix) DataCell(org.knime.core.data.DataCell) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Aggregations

StatisticsTable (org.knime.base.data.statistics.StatisticsTable)8 DataTableSpec (org.knime.core.data.DataTableSpec)6 BufferedDataTable (org.knime.core.node.BufferedDataTable)5 ExecutionMonitor (org.knime.core.node.ExecutionMonitor)5 DataColumnSpec (org.knime.core.data.DataColumnSpec)4 DataRow (org.knime.core.data.DataRow)3 DataTable (org.knime.core.data.DataTable)3 DoubleValue (org.knime.core.data.DoubleValue)3 LinkedHashSet (java.util.LinkedHashSet)2 DataCell (org.knime.core.data.DataCell)2 BufferedDataContainer (org.knime.core.node.BufferedDataContainer)2 InvalidSettingsException (org.knime.core.node.InvalidSettingsException)2 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 LinkedHashMap (java.util.LinkedHashMap)1 Map (java.util.Map)1 FilterColumnTable (org.knime.base.data.filter.column.FilterColumnTable)1 Normalizer (org.knime.base.data.normalize.Normalizer)1 HalfDoubleMatrix (org.knime.base.util.HalfDoubleMatrix)1 DataType (org.knime.core.data.DataType)1