Search in sources :

Example 1 with GroupKey

use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.

the class MemberCounter method loadInstance.

 * Load a member counter from the provided model content.
 * @param model the model content
 * @return the properly initialized member counter
 * @throws InvalidSettingsException if the input settings cannot be parsed
static MemberCounter loadInstance(final ModelContentRO model) throws InvalidSettingsException {
    // start from an empty counter that is filled from the stored settings
    final MemberCounter counter = new MemberCounter();
    // every child of the model holds the settings of one outlier column
    final Enumeration<ModelContentRO> colSettings = model.children();
    while (colSettings.hasMoreElements()) {
        final ModelContentRO colSetting = colSettings.nextElement();
        // name of the outlier column the following group counts belong to
        final String outlierColName = colSetting.getString(CFG_OUT_COL_NAME);
        // every child of the group-counts content holds one (group key, count) pair
        final Enumeration<ModelContentRO> groupCounts = colSetting.getModelContent(CFG_GROUP_COUNTS).children();
        while (groupCounts.hasMoreElements()) {
            final ModelContentRO groupCount = groupCounts.nextElement();
            // rebuild the group key from the stored data cell array
            final GroupKey key = new GroupKey(groupCount.getDataCellArray(CFG_GROUP_KEY));
            final int count = groupCount.getInt(CFG_GROUP_VAL);
            // register the stored count for this column/group combination
            counter.incrementMemberCount(outlierColName, key, count);
    // return the populated counter
    return counter;
Also used : ModelContentRO(org.knime.core.node.ModelContentRO) GroupKey(org.knime.base.node.preproc.groupby.GroupKey)

Example 2 with GroupKey

use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.

the class NumericOutliersReviser method replaceOutliers.

 * Replaces outliers found in the row input according to the selected replacement option. Additionally, the outlier
 * replacement counts and new domains are calculated.
 * @param exec the execution context
 * @param in the row input whose outliers have to be treated
 * @param out the row output whose outliers have been treated
 * @param outlierModel the model storing the permitted intervals
 * @param memberCounter the member counter
 * @param outlierRepCounter the outlier replacement counter
 * @param missingGroupsCounter the missing groups counter
 * @throws Exception any exception indicating an error or a cancellation
private void replaceOutliers(final ExecutionContext exec, final RowInput in, final RowOutput out, final NumericOutliersModel outlierModel, final MemberCounter memberCounter, final MemberCounter outlierRepCounter, final MemberCounter missingGroupsCounter) throws Exception {
    // total number of outlier columns
    final int noOutliers = m_outlierColNames.length;
    // the in table spec
    final DataTableSpec inSpec = in.getDataTableSpec();
    // create column re-arranger to overwrite cells corresponding to outliers
    final ColumnRearranger colRearranger = new ColumnRearranger(inSpec);
    // store the positions where the outlier column names can be found in the input table
    final int[] outlierIndices = calculateOutlierIndicies(inSpec);
    // the specs of the replaced columns are taken over unchanged from the input spec
    final DataColumnSpec[] outlierSpecs = new DataColumnSpec[noOutliers];
    for (int i = 0; i < noOutliers; i++) {
        outlierSpecs[i] = inSpec.getColumnSpec(outlierIndices[i]);
    // values are copied anyway by the re-arranger so there is no need to
    // create new instances for each row
    // NOTE(review): this buffer is shared by all getCells invocations; if the 'true' flag passed to the
    // AbstractCellFactory constructor below enables concurrent processing, it would be shared between
    // threads - confirm against the AbstractCellFactory documentation
    final DataCell[] treatedVals = new DataCell[noOutliers];
    final AbstractCellFactory fac = new AbstractCellFactory(true, outlierSpecs) {

        // computes the (possibly replaced) values of all outlier columns for a single row
        public DataCell[] getCells(final DataRow row) {
            // the group the row belongs to and the permitted intervals stored for that group
            final GroupKey key = outlierModel.getKey(row, inSpec);
            final Map<String, double[]> colsMap = outlierModel.getGroupIntervals(key);
            for (int i = 0; i < noOutliers; i++) {
                final DataCell curCell = row.getCell(outlierIndices[i]);
                final DataCell treatedCell;
                final String outlierColName = m_outlierColNames[i];
                if (!curCell.isMissing()) {
                    // if the key exists treat the value, otherwise we process an unknown group
                    if (colsMap != null) {
                        // increment the member counter
                        memberCounter.incrementMemberCount(outlierColName, key);
                        // treat the value of the cell if it is an outlier
                        treatedCell = treatCellValue(colsMap.get(outlierColName), curCell);
                    } else {
                        // unknown group: count it and keep the value untouched
                        missingGroupsCounter.incrementMemberCount(outlierColName, key);
                        treatedCell = curCell;
                } else {
                    // missing cells are passed through unchanged
                    treatedCell = curCell;
                // if we changed the value this is an outlier
                if (!treatedCell.equals(curCell)) {
                    outlierRepCounter.incrementMemberCount(outlierColName, key);
                // update the domain if necessary
                if (m_updateDomain && !treatedCell.isMissing()) {
                    m_domainUpdater.updateDomain(outlierColName, ((DoubleValue) treatedCell).getDoubleValue());
                treatedVals[i] = treatedCell;
            return treatedVals;
    // replace the outlier columns by their updated versions
    colRearranger.replace(fac, outlierIndices);
    // stream it: run the re-arranger as a streamable function from the row input to the row output
    colRearranger.createStreamableFunction().runFinal(new PortInput[] { in }, new PortOutput[] { out }, exec);
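Also used : DataTableSpec(org.knime.core.data.DataTableSpec) AbstractCellFactory(org.knime.core.data.container.AbstractCellFactory) GroupKey(org.knime.base.node.preproc.groupby.GroupKey) DataRow(org.knime.core.data.DataRow) ColumnRearranger(org.knime.core.data.container.ColumnRearranger) DataColumnSpec(org.knime.core.data.DataColumnSpec) DataCell(org.knime.core.data.DataCell)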

Example 3 with GroupKey

use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.

the class NumericOutliersIntervalsCalculator method calcPermittedIntervals.

 * Calculates the permitted interval boundaries from the first and third quartile values stored in the input table
 * and stores them in an outlier model, one interval per group and outlier column.
 * @param exec the execution context
 * @param quartiles the data table holding the groups, and the first and third quartile for each of the outlier
 *            columns
 * @return the outlier model storing the permitted interval
 * @throws CanceledExecutionException if the user has canceled the execution
private NumericOutliersModel calcPermittedIntervals(final ExecutionContext exec, final BufferedDataTable quartiles) throws CanceledExecutionException {
    final DataTableSpec quartilesSpec = quartiles.getDataTableSpec();
    // the group by table does not rename the group columns so we can use this spec, instead of the
    // in table spec as well (if this is changed the quartilesSpec has to be replaced by the inSpec)
    final NumericOutliersModel model = new NumericOutliersModel(m_groupColNames, m_outlierColNames);
    // first position where outlier columns can be found (the group columns come first)
    final int outlierOffset = m_groupColNames.length;
    // store counters to update the progress
    final long rowCount = quartiles.size();
    long rowCounter = 1;
    for (final DataRow row : quartiles) {
        // 'final' due to access in lambda expression
        final long rowCounterLong = rowCounter++;
        exec.setProgress(rowCounterLong / (double) rowCount, () -> "Storing interval for row " + rowCounterLong + " of " + rowCount);
        // calculate the groups key
        final GroupKey key = model.getKey(row, quartilesSpec);
        for (int i = 0; i < m_outlierColNames.length; i++) {
            // the permitted interval
            final double[] permInterval;
            // index of the outlier column in the quartiles table; each outlier column occupies two
            // consecutive cells (first quartile, third quartile)
            final int index = i * 2 + outlierOffset;
            // the first quartile cell
            DataCell fQuart = row.getCell(index);
            // the third quartile cell
            DataCell tQuart = row.getCell(index + 1);
            // an interval can only be calculated if both quartiles are present; according to the original
            // comment they are missing when the entire group consists of missing values
            if (!fQuart.isMissing() && !tQuart.isMissing()) {
                // value of the first quartile
                final double fQ = ((DoubleValue) fQuart).getDoubleValue();
                // value of the third quartile
                final double tQ = ((DoubleValue) tQuart).getDoubleValue();
                // calculate the scaled IQR
                final double iqr = m_iqrMultiplier * (tQ - fQ);
                // store the interval: [first quartile - scaled IQR, third quartile + scaled IQR]
                permInterval = new double[] { fQ - iqr, tQ + iqr };
            } else {
                // no interval available for this group/column combination
                permInterval = null;
            // setting null here is vital and will be treated by the outlier reviser.
            model.addEntry(key, m_outlierColNames[i], permInterval);
    return model;
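Also used : DataTableSpec(org.knime.core.data.DataTableSpec) DoubleValue(org.knime.core.data.DoubleValue) GroupKey(org.knime.base.node.preproc.groupby.GroupKey) DataCell(org.knime.core.data.DataCell) DataRow(org.knime.core.data.DataRow)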

Example 4 with GroupKey

use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.

the class MemberCounter method saveModel.

 * Saves the member counter to the provided model content.
 * @param model the model content to save to
void saveModel(final ModelContentWO model) {
    // running index used to give each outlier column its own uniquely named sub-content
    int gInd = 0;
    for (Entry<String, Map<GroupKey, Integer>> entry : m_groupCounts.entrySet()) {
        // one sub-content per outlier column, holding the column name and its group counts
        final ModelContentWO colSettings = model.addModelContent(CFG_OUT_COL + gInd++);
        colSettings.addString(CFG_OUT_COL_NAME, entry.getKey());
        final ModelContentWO groupCounts = colSettings.addModelContent(CFG_GROUP_COUNTS);
        // running index used to give each group of this column its own uniquely named sub-content
        int oInd = 0;
        for (Entry<GroupKey, Integer> gCountEntry : entry.getValue().entrySet()) {
            // one sub-content per group, holding the group key values and the associated count
            final ModelContentWO groupCount = groupCounts.addModelContent(CFG_GROUP_COUNT + oInd++);
            groupCount.addDataCellArray(CFG_GROUP_KEY, gCountEntry.getKey().getGroupVals());
            groupCount.addInt(CFG_GROUP_VAL, gCountEntry.getValue());
Also used : ModelContentWO(org.knime.core.node.ModelContentWO) GroupKey(org.knime.base.node.preproc.groupby.GroupKey) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 5 with GroupKey

use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.

the class NumericOutliersReviser method treatRows.

 * Removes/Retains all rows from the row input that contain outliers. Additionally, the outlier and group related
 * counts, and the new domains are calculated.
 * @param exec the execution context
 * @param in the row input whose outliers have to be treated
 * @param out the row output whose outliers have been treated
 * @param permIntervalsModel the model storing the permitted intervals
 * @param rowCount the row count of the row input
 * @param memberCounter the member counter
 * @param outlierRepCounter the outlier replacement counter
 * @param missingGroupsCounter the missing groups counter
 * @throws CanceledExecutionException if the user has canceled the execution
 * @throws InterruptedException if canceled
private void treatRows(final ExecutionContext exec, final RowInput in, final RowOutput out, final NumericOutliersModel permIntervalsModel, final long rowCount, final MemberCounter memberCounter, final MemberCounter outlierRepCounter, final MemberCounter missingGroupsCounter) throws CanceledExecutionException, InterruptedException {
    // NOTE(review): nothing in the visible excerpt writes to 'out'; the excerpt appears to end before the
    // method does - confirm against the full source where rows are pushed and the output is closed
    // the in spec
    final DataTableSpec inSpec = in.getDataTableSpec();
    // store the positions where the outlier column names can be found in the input table
    final int[] outlierIndices = calculateOutlierIndicies(inSpec);
    // total number of outlier columns
    final int noOutliers = m_outlierColNames.length;
    // row count as double so that the progress fraction is calculated in floating point
    final double divisor = rowCount;
    long rowCounter = 1;
    // for each row test if it contains an outlier
    DataRow row;
    while ((row = in.poll()) != null) {
        // progress is only reported if a positive row count was provided
        if (rowCount > 0) {
            // 'final' due to access in lambda expression
            final long rowCounterLong = rowCounter++;
            exec.setProgress(rowCounterLong / divisor, () -> "Testing row " + rowCounterLong + " of " + rowCount + " for outliers");
        // get the group key of the currently processed row
        final GroupKey key = permIntervalsModel.getKey(row, inSpec);
        // get the map holding the permitted intervals for the given groups key
        Map<String, double[]> colsMap = permIntervalsModel.getGroupIntervals(key);
        // stays true as long as none of the outlier columns of this row holds an outlier
        boolean outlierFreeRow = true;
        for (int i = 0; i < noOutliers; i++) {
            final DataCell cell = row.getCell(outlierIndices[i]);
            final String outlierColName = m_outlierColNames[i];
            // if the key is existent check the rows, otherwise increment the missing group counters
            if (colsMap != null) {
                final double[] interval = colsMap.get(outlierColName);
                // missing cells are neither counted as members nor tested
                if (!cell.isMissing()) {
                    // increment the member counter
                    memberCounter.incrementMemberCount(outlierColName, key);
                    final double val = ((DoubleValue) cell).getDoubleValue();
                    // the model might not have learned anything about this key - outlier column combination
                    if (interval != null && isOutlier(interval, val)) {
                        outlierFreeRow = false;
                        // increment the outlier counter
                        outlierRepCounter.incrementMemberCount(outlierColName, key);
            } else {
                // unknown group: only non-missing cells are counted
                if (!cell.isMissing()) {
                    missingGroupsCounter.incrementMemberCount(outlierColName, key);
        // the row qualifies if it is outlier free under FILTER, or contains an outlier under RETAIN
        if ((outlierFreeRow && m_treatment == NumericOutliersTreatmentOption.FILTER) || (!outlierFreeRow && m_treatment == NumericOutliersTreatmentOption.RETAIN)) {
            // update the domain if necessary
            if (m_updateDomain) {
                DataCell cell;
                for (int i = 0; i < noOutliers; i++) {
                    // only non-missing values contribute to the domain
                    if (!(cell = row.getCell(outlierIndices[i])).isMissing()) {
                        m_domainUpdater.updateDomain(m_outlierColNames[i], ((DoubleValue) cell).getDoubleValue());
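Also used : DataTableSpec(org.knime.core.data.DataTableSpec) GroupKey(org.knime.base.node.preproc.groupby.GroupKey) DataRow(org.knime.core.data.DataRow) DoubleValue(org.knime.core.data.DoubleValue) DataCell(org.knime.core.data.DataCell)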


GroupKey (org.knime.base.node.preproc.groupby.GroupKey)7 DataCell ( DataRow ( DataTableSpec ( HashMap (java.util.HashMap)2 LinkedHashMap (java.util.LinkedHashMap)2 Map (java.util.Map)2 DoubleValue ( ModelContentRO (org.knime.core.node.ModelContentRO)2 ModelContentWO (org.knime.core.node.ModelContentWO)2 DataColumnSpec ( AbstractCellFactory ( ColumnRearranger (