Search in sources :

Example 1 with GroupKey

use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.

the class MemberCounter method loadInstance.

/**
 * Restores a member counter from the given model content.
 *
 * <p>
 * Every child of {@code model} is read as one outlier column: its name is taken from {@code CFG_OUT_COL_NAME} and
 * each child of its {@code CFG_GROUP_COUNTS} content contributes one group key / count pair to the counter.
 * </p>
 *
 * @param model the model content to read from
 * @return a member counter holding all counts found in the model content
 * @throws InvalidSettingsException if one of the expected entries cannot be read
 */
@SuppressWarnings("unchecked")
static MemberCounter loadInstance(final ModelContentRO model) throws InvalidSettingsException {
    final MemberCounter restored = new MemberCounter();
    // outer level: one child per outlier column
    for (final Enumeration<ModelContentRO> columns = model.children(); columns.hasMoreElements();) {
        final ModelContentRO column = columns.nextElement();
        final String columnName = column.getString(CFG_OUT_COL_NAME);
        // inner level: one child per group of the current outlier column
        for (final Enumeration<ModelContentRO> groups =
            column.getModelContent(CFG_GROUP_COUNTS).children(); groups.hasMoreElements();) {
            final ModelContentRO group = groups.nextElement();
            final GroupKey groupKey = new GroupKey(group.getDataCellArray(CFG_GROUP_KEY));
            final int groupCount = group.getInt(CFG_GROUP_VAL);
            restored.incrementMemberCount(columnName, groupKey, groupCount);
        }
    }
    return restored;
}
Also used : ModelContentRO(org.knime.core.node.ModelContentRO) GroupKey(org.knime.base.node.preproc.groupby.GroupKey)

Example 2 with GroupKey

use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.

the class NumericOutliersReviser method replaceOutliers.

/**
 * Streams the row input to the row output while overwriting the outlier columns with their treated values.
 * Along the way the member counts, the outlier replacement counts, the counts for groups unknown to the model
 * and, if enabled, the new domains are updated.
 *
 * @param exec the execution context
 * @param in the row input whose outliers have to be treated
 * @param out the row output receiving the treated rows
 * @param outlierModel the model storing the permitted intervals
 * @param memberCounter the member counter
 * @param outlierRepCounter the outlier replacement counter
 * @param missingGroupsCounter the missing groups counter
 * @throws Exception any exception to indicate an error or cancellation
 */
private void replaceOutliers(final ExecutionContext exec, final RowInput in, final RowOutput out, final NumericOutliersModel outlierModel, final MemberCounter memberCounter, final MemberCounter outlierRepCounter, final MemberCounter missingGroupsCounter) throws Exception {
    // number of outlier columns
    final int noOutliers = m_outlierColNames.length;
    final DataTableSpec inSpec = in.getDataTableSpec();
    // the re-arranger overwrites the cells of the outlier columns
    final ColumnRearranger colRearranger = new ColumnRearranger(inSpec);
    // positions of the outlier columns in the input table
    final int[] outlierIndices = calculateOutlierIndicies(inSpec);
    // the replaced columns keep their original specs
    final DataColumnSpec[] outlierSpecs = new DataColumnSpec[noOutliers];
    for (int col = 0; col < noOutliers; col++) {
        outlierSpecs[col] = inSpec.getColumnSpec(outlierIndices[col]);
    }
    // one buffer shared by all rows: values are copied anyways by the re-arranger, so there is no need to
    // create a new array for each row
    final DataCell[] treatedVals = new DataCell[noOutliers];
    final AbstractCellFactory fac = new AbstractCellFactory(true, outlierSpecs) {

        @Override
        public DataCell[] getCells(final DataRow row) {
            final GroupKey key = outlierModel.getKey(row, inSpec);
            // null if the model does not know the group of this row
            final Map<String, double[]> intervals = outlierModel.getGroupIntervals(key);
            for (int col = 0; col < noOutliers; col++) {
                final DataCell original = row.getCell(outlierIndices[col]);
                final String colName = m_outlierColNames[col];
                // missing cells and cells of unknown groups are passed through untouched
                DataCell treated = original;
                if (!original.isMissing()) {
                    if (intervals == null) {
                        // unknown group: only count it
                        missingGroupsCounter.incrementMemberCount(colName, key);
                    } else {
                        // known group: count the member and treat its value
                        memberCounter.incrementMemberCount(colName, key);
                        treated = treatCellValue(intervals.get(colName), original);
                    }
                }
                // a changed value means the cell was an outlier
                if (!treated.equals(original)) {
                    outlierRepCounter.incrementMemberCount(colName, key);
                }
                // update the domain if necessary
                if (m_updateDomain && !treated.isMissing()) {
                    m_domainUpdater.updateDomain(colName, ((DoubleValue) treated).getDoubleValue());
                }
                treatedVals[col] = treated;
            }
            return treatedVals;
        }
    };
    // replace the outlier columns by their treated versions
    colRearranger.replace(fac, outlierIndices);
    // run the replacement as a streamable function over the input
    colRearranger.createStreamableFunction().runFinal(new PortInput[] { in }, new PortOutput[] { out }, exec);
    exec.setProgress(1);
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) AbstractCellFactory(org.knime.core.data.container.AbstractCellFactory) GroupKey(org.knime.base.node.preproc.groupby.GroupKey) DataRow(org.knime.core.data.DataRow) ColumnRearranger(org.knime.core.data.container.ColumnRearranger) DataColumnSpec(org.knime.core.data.DataColumnSpec) DataCell(org.knime.core.data.DataCell)

Example 3 with GroupKey

use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.

the class NumericOutliersIntervalsCalculator method calcPermittedIntervals.

/**
 * Builds the outlier model by turning the first and third quartile of each group / outlier column combination
 * into a permitted interval {@code [Q1 - k * (Q3 - Q1), Q3 + k * (Q3 - Q1)]}, where {@code k} is the IQR
 * multiplier. If one of the two quartiles is missing, {@code null} is stored as interval.
 *
 * @param exec the execution context
 * @param quartiles the data table holding the groups, and the first and third quartile for each of the outlier
 *            columns
 * @return the outlier model storing the permitted intervals
 * @throws CanceledExecutionException if the user has canceled the execution
 */
private NumericOutliersModel calcPermittedIntervals(final ExecutionContext exec, final BufferedDataTable quartiles) throws CanceledExecutionException {
    // the group by table does not rename the group columns so its spec can be used to build the group keys
    // (if this is changed the spec of the original input table has to be used instead)
    final DataTableSpec quartilesSpec = quartiles.getDataTableSpec();
    final NumericOutliersModel model = new NumericOutliersModel(m_groupColNames, m_outlierColNames);
    // the quartile columns follow directly after the group columns
    final int firstQuartileCol = m_groupColNames.length;
    // counters used to report the progress
    final long totalRows = quartiles.size();
    long processedRows = 0;
    for (final DataRow quartileRow : quartiles) {
        exec.checkCanceled();
        // 'final' copy since it is accessed in the lambda expression
        final long currentRow = ++processedRows;
        exec.setProgress(currentRow / (double) totalRows, () -> "Storing interval for row " + currentRow + " of " + totalRows);
        // the key of the group described by this row
        final GroupKey groupKey = model.getKey(quartileRow, quartilesSpec);
        // each outlier column occupies two consecutive cells: first quartile, then third quartile
        int cellIndex = firstQuartileCol;
        for (final String outlierColName : m_outlierColNames) {
            final DataCell firstQuartile = quartileRow.getCell(cellIndex++);
            final DataCell thirdQuartile = quartileRow.getCell(cellIndex++);
            // stays null if a quartile is missing; storing null is vital, it is treated by the outlier reviser
            double[] permInterval = null;
            if (!(firstQuartile.isMissing() || thirdQuartile.isMissing())) {
                final double fQ = ((DoubleValue) firstQuartile).getDoubleValue();
                final double tQ = ((DoubleValue) thirdQuartile).getDoubleValue();
                // the scaled interquartile range
                final double iqr = m_iqrMultiplier * (tQ - fQ);
                permInterval = new double[] { fQ - iqr, tQ + iqr };
            }
            model.addEntry(groupKey, outlierColName, permInterval);
        }
    }
    return model;
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) DoubleValue(org.knime.core.data.DoubleValue) GroupKey(org.knime.base.node.preproc.groupby.GroupKey) DataCell(org.knime.core.data.DataCell) DataRow(org.knime.core.data.DataRow)

Example 4 with GroupKey

use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.

the class MemberCounter method saveModel.

/**
 * Writes the member counter to the provided model content.
 *
 * <p>
 * For each outlier column a child content named {@code CFG_OUT_COL + index} is added. It stores the column name
 * and, below {@code CFG_GROUP_COUNTS}, one child named {@code CFG_GROUP_COUNT + index} per group holding the
 * group key values and the count.
 * </p>
 *
 * @param model the model content to save to
 */
void saveModel(final ModelContentWO model) {
    int columnIndex = 0;
    for (final Entry<String, Map<GroupKey, Integer>> columnEntry : m_groupCounts.entrySet()) {
        // one numbered child per outlier column
        final ModelContentWO columnContent = model.addModelContent(CFG_OUT_COL + columnIndex);
        columnIndex++;
        columnContent.addString(CFG_OUT_COL_NAME, columnEntry.getKey());
        final ModelContentWO countsContent = columnContent.addModelContent(CFG_GROUP_COUNTS);
        int groupIndex = 0;
        for (final Entry<GroupKey, Integer> groupEntry : columnEntry.getValue().entrySet()) {
            // one numbered child per group of the current column
            final ModelContentWO groupContent = countsContent.addModelContent(CFG_GROUP_COUNT + groupIndex);
            groupIndex++;
            groupContent.addDataCellArray(CFG_GROUP_KEY, groupEntry.getKey().getGroupVals());
            groupContent.addInt(CFG_GROUP_VAL, groupEntry.getValue());
        }
    }
}
Also used : ModelContentWO(org.knime.core.node.ModelContentWO) GroupKey(org.knime.base.node.preproc.groupby.GroupKey) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 5 with GroupKey

use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.

the class NumericOutliersReviser method treatRows.

/**
 * Polls all rows from the row input and pushes only those matching the selected treatment option to the row
 * output: with {@code FILTER} the rows without any outlier are kept, with {@code RETAIN} the rows containing at
 * least one outlier. Additionally, the member, outlier and missing group counts are updated and, if enabled,
 * the domains are updated with the values of the rows that were pushed.
 *
 * @param exec the execution context
 * @param in the row input whose outliers have to be treated
 * @param out the row output receiving the rows that passed the treatment
 * @param permIntervalsModel the model storing the permitted intervals
 * @param rowCount the row count of the row input; progress is only reported if it is positive
 * @param memberCounter the member counter
 * @param outlierRepCounter the outlier replacement counter
 * @param missingGroupsCounter the missing groups counter
 * @throws CanceledExecutionException if the user has canceled the execution
 * @throws InterruptedException if canceled
 */
private void treatRows(final ExecutionContext exec, final RowInput in, final RowOutput out, final NumericOutliersModel permIntervalsModel, final long rowCount, final MemberCounter memberCounter, final MemberCounter outlierRepCounter, final MemberCounter missingGroupsCounter) throws CanceledExecutionException, InterruptedException {
    final DataTableSpec inSpec = in.getDataTableSpec();
    // positions of the outlier columns in the input table
    final int[] outlierIndices = calculateOutlierIndicies(inSpec);
    // number of outlier columns
    final int noOutliers = m_outlierColNames.length;
    final double divisor = rowCount;
    long processedRows = 1;
    // test each row for outliers until the input is exhausted
    for (DataRow row = in.poll(); row != null; row = in.poll()) {
        exec.checkCanceled();
        if (rowCount > 0) {
            // 'final' copy since it is accessed in the lambda expression
            final long currentRow = processedRows++;
            exec.setProgress(currentRow / divisor, () -> "Testing row " + currentRow + " of " + rowCount + " for outliers");
        }
        // the group of the current row and its permitted intervals (null if the group is unknown to the model)
        final GroupKey key = permIntervalsModel.getKey(row, inSpec);
        final Map<String, double[]> colsMap = permIntervalsModel.getGroupIntervals(key);
        boolean containsOutlier = false;
        for (int col = 0; col < noOutliers; col++) {
            final DataCell cell = row.getCell(outlierIndices[col]);
            final String outlierColName = m_outlierColNames[col];
            if (colsMap == null) {
                // unknown group: only non-missing cells are counted
                if (!cell.isMissing()) {
                    missingGroupsCounter.incrementMemberCount(outlierColName, key);
                }
                continue;
            }
            final double[] interval = colsMap.get(outlierColName);
            if (cell.isMissing()) {
                continue;
            }
            // known group and non-missing cell: count the member
            memberCounter.incrementMemberCount(outlierColName, key);
            final double val = ((DoubleValue) cell).getDoubleValue();
            // the model might not have learned anything about this key - outlier column combination
            if (interval != null && isOutlier(interval, val)) {
                containsOutlier = true;
                outlierRepCounter.incrementMemberCount(outlierColName, key);
            }
        }
        // rows with outliers are kept when retaining, outlier free rows are kept when filtering
        final NumericOutliersTreatmentOption keptBy =
            containsOutlier ? NumericOutliersTreatmentOption.RETAIN : NumericOutliersTreatmentOption.FILTER;
        if (m_treatment == keptBy) {
            out.push(row);
            // update the domain if necessary
            if (m_updateDomain) {
                for (int col = 0; col < noOutliers; col++) {
                    final DataCell keptCell = row.getCell(outlierIndices[col]);
                    if (!keptCell.isMissing()) {
                        m_domainUpdater.updateDomain(m_outlierColNames[col], ((DoubleValue) keptCell).getDoubleValue());
                    }
                }
            }
        }
    }
    out.close();
}
Also used : DataTableSpec(org.knime.core.data.DataTableSpec) GroupKey(org.knime.base.node.preproc.groupby.GroupKey) DataRow(org.knime.core.data.DataRow) DoubleValue(org.knime.core.data.DoubleValue) DataCell(org.knime.core.data.DataCell)

Aggregations

GroupKey (org.knime.base.node.preproc.groupby.GroupKey): 7, DataCell (org.knime.core.data.DataCell): 3, DataRow (org.knime.core.data.DataRow): 3, DataTableSpec (org.knime.core.data.DataTableSpec): 3, HashMap (java.util.HashMap): 2, LinkedHashMap (java.util.LinkedHashMap): 2, Map (java.util.Map): 2, DoubleValue (org.knime.core.data.DoubleValue): 2, ModelContentRO (org.knime.core.node.ModelContentRO): 2, ModelContentWO (org.knime.core.node.ModelContentWO): 2, DataColumnSpec (org.knime.core.data.DataColumnSpec): 1, AbstractCellFactory (org.knime.core.data.container.AbstractCellFactory): 1, ColumnRearranger (org.knime.core.data.container.ColumnRearranger): 1