use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.
the class MemberCounter method loadInstance.
/**
 * Restores a member counter from the provided model content.
 *
 * <p>Each child of {@code model} is read as the entry of one outlier column: the column name is taken from
 * {@code CFG_OUT_COL_NAME}, and every child of its {@code CFG_GROUP_COUNTS} content contributes one group key
 * (stored under {@code CFG_GROUP_KEY}) together with the count stored under {@code CFG_GROUP_VAL}.</p>
 *
 * @param model the model content to read from
 * @return a new member counter holding all counts found in the model content
 * @throws InvalidSettingsException if the input settings cannot be parsed
 */
@SuppressWarnings("unchecked")
static MemberCounter loadInstance(final ModelContentRO model) throws InvalidSettingsException {
    final MemberCounter restored = new MemberCounter();
    // outer level: one child per outlier column
    for (final Enumeration<ModelContentRO> columns = model.children(); columns.hasMoreElements();) {
        final ModelContentRO column = columns.nextElement();
        final String outlierColName = column.getString(CFG_OUT_COL_NAME);
        final ModelContentRO countsContent = column.getModelContent(CFG_GROUP_COUNTS);
        // inner level: one child per group of the current outlier column
        for (final Enumeration<ModelContentRO> groups = countsContent.children(); groups.hasMoreElements();) {
            final ModelContentRO group = groups.nextElement();
            final GroupKey key = new GroupKey(group.getDataCellArray(CFG_GROUP_KEY));
            final int count = group.getInt(CFG_GROUP_VAL);
            restored.incrementMemberCount(outlierColName, key, count);
        }
    }
    return restored;
}
use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.
the class NumericOutliersReviser method replaceOutliers.
/**
 * Streams the row input to the row output while overwriting the outlier columns with their treated values.
 * While doing so the member, outlier replacement and missing group counts are updated and, if enabled, the new
 * domains are tracked.
 *
 * @param exec the execution context
 * @param in the row input whose outliers have to be treated
 * @param out the row output whose outliers have been treated
 * @param outlierModel the model storing the permitted intervals
 * @param memberCounter the member counter
 * @param outlierRepCounter the outlier replacement counter
 * @param missingGroupsCounter the missing groups counter
 * @throws Exception any exception to indicate an error, cancelation
 */
private void replaceOutliers(final ExecutionContext exec, final RowInput in, final RowOutput out, final NumericOutliersModel outlierModel, final MemberCounter memberCounter, final MemberCounter outlierRepCounter, final MemberCounter missingGroupsCounter) throws Exception {
    final int noOutliers = m_outlierColNames.length;
    final DataTableSpec inSpec = in.getDataTableSpec();
    // the re-arranger overwrites the cells of the outlier columns
    final ColumnRearranger colRearranger = new ColumnRearranger(inSpec);
    // positions of the outlier columns in the input table and their specs
    final int[] outlierIndices = calculateOutlierIndicies(inSpec);
    final DataColumnSpec[] outlierSpecs = new DataColumnSpec[noOutliers];
    for (int col = 0; col < noOutliers; col++) {
        outlierSpecs[col] = inSpec.getColumnSpec(outlierIndices[col]);
    }
    // a single buffer is shared by all rows; the original author notes that the re-arranger copies the values
    // anyways, so no new instance per row is required
    final DataCell[] treatedVals = new DataCell[noOutliers];
    final AbstractCellFactory fac = new AbstractCellFactory(true, outlierSpecs) {
        @Override
        public DataCell[] getCells(final DataRow row) {
            final GroupKey key = outlierModel.getKey(row, inSpec);
            // null if the model does not know the group of this row
            final Map<String, double[]> intervals = outlierModel.getGroupIntervals(key);
            for (int col = 0; col < noOutliers; col++) {
                final String outlierColName = m_outlierColNames[col];
                final DataCell original = row.getCell(outlierIndices[col]);
                // missing cells and cells of unknown groups are passed through untouched
                DataCell result = original;
                if (!original.isMissing()) {
                    if (intervals == null) {
                        // unknown group: only record it
                        missingGroupsCounter.incrementMemberCount(outlierColName, key);
                    } else {
                        // known group: count the member and treat its value
                        memberCounter.incrementMemberCount(outlierColName, key);
                        result = treatCellValue(intervals.get(outlierColName), original);
                    }
                }
                // a changed value means the cell was an outlier
                if (!result.equals(original)) {
                    outlierRepCounter.incrementMemberCount(outlierColName, key);
                }
                // track the domain of the values that end up in the output
                if (m_updateDomain && !result.isMissing()) {
                    m_domainUpdater.updateDomain(outlierColName, ((DoubleValue) result).getDoubleValue());
                }
                treatedVals[col] = result;
            }
            return treatedVals;
        }
    };
    // swap the outlier columns for their treated versions and stream the data through
    colRearranger.replace(fac, outlierIndices);
    colRearranger.createStreamableFunction().runFinal(new PortInput[] { in }, new PortOutput[] { out }, exec);
    exec.setProgress(1);
}
use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.
the class NumericOutliersIntervalsCalculator method calcPermittedIntervals.
/**
 * Derives the permitted interval of every group and outlier column from the first and third quartile stored in
 * the input table. For quartiles {@code q1} and {@code q3} the interval is
 * {@code [q1 - m * (q3 - q1), q3 + m * (q3 - q1)]} with {@code m} being the IQR multiplier. If one of the two
 * quartiles is missing, {@code null} is stored instead of an interval.
 *
 * @param exec the execution context
 * @param quartiles the data table holding the groups, and the first and third quartile for each of the outlier
 *            columns
 * @return the outlier model storing the permitted interval
 * @throws CanceledExecutionException if the user has canceled the execution
 */
private NumericOutliersModel calcPermittedIntervals(final ExecutionContext exec, final BufferedDataTable quartiles) throws CanceledExecutionException {
    // the group by table does not rename the group columns so we can use this spec, instead of the
    // in table spec as well (if this is changed the quartilesSpec has to be replaced by the inSpec)
    final DataTableSpec quartilesSpec = quartiles.getDataTableSpec();
    final NumericOutliersModel model = new NumericOutliersModel(m_groupColNames, m_outlierColNames);
    // the quartile columns start right after the group columns
    final int outlierOffset = m_groupColNames.length;
    // progress bookkeeping
    final long rowCount = quartiles.size();
    long nextRowNo = 1;
    for (final DataRow row : quartiles) {
        exec.checkCanceled();
        // 'final' copy because the progress message lambda captures it
        final long rowNo = nextRowNo++;
        exec.setProgress(rowNo / (double) rowCount, () -> "Storing interval for row " + rowNo + " of " + rowCount);
        // the key of the group described by this row
        final GroupKey key = model.getKey(row, quartilesSpec);
        for (int col = 0; col < m_outlierColNames.length; col++) {
            // each outlier column occupies two consecutive cells: first quartile, then third quartile
            final int firstQuartIdx = col * 2 + outlierOffset;
            final DataCell firstQuart = row.getCell(firstQuartIdx);
            final DataCell thirdQuart = row.getCell(firstQuartIdx + 1);
            final double[] permInterval;
            if (firstQuart.isMissing() || thirdQuart.isMissing()) {
                // no quartiles available (e.g. the entire group consists of missing values)
                permInterval = null;
            } else {
                final double q1 = ((DoubleValue) firstQuart).getDoubleValue();
                final double q3 = ((DoubleValue) thirdQuart).getDoubleValue();
                // the scaled interquartile range
                final double iqr = m_iqrMultiplier * (q3 - q1);
                permInterval = new double[] { q1 - iqr, q3 + iqr };
            }
            // storing null here is intentional; per the original author it is handled by the outlier reviser
            model.addEntry(key, m_outlierColNames[col], permInterval);
        }
    }
    return model;
}
use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.
the class MemberCounter method saveModel.
/**
 * Writes the member counts to the provided model content.
 *
 * <p>For every outlier column a child content named {@code CFG_OUT_COL + index} is created that stores the column
 * name and, nested under {@code CFG_GROUP_COUNTS}, one child named {@code CFG_GROUP_COUNT + index} per group
 * holding the group key values and the count.</p>
 *
 * @param model the model content to save to
 */
void saveModel(final ModelContentWO model) {
    int colIdx = 0;
    for (final Entry<String, Map<GroupKey, Integer>> colEntry : m_groupCounts.entrySet()) {
        // one content per outlier column, numbered in iteration order
        final ModelContentWO colContent = model.addModelContent(CFG_OUT_COL + colIdx);
        colIdx++;
        colContent.addString(CFG_OUT_COL_NAME, colEntry.getKey());
        final ModelContentWO countsContent = colContent.addModelContent(CFG_GROUP_COUNTS);
        int groupIdx = 0;
        for (final Entry<GroupKey, Integer> groupEntry : colEntry.getValue().entrySet()) {
            // one content per group of the current column, numbered in iteration order
            final ModelContentWO groupContent = countsContent.addModelContent(CFG_GROUP_COUNT + groupIdx);
            groupIdx++;
            groupContent.addDataCellArray(CFG_GROUP_KEY, groupEntry.getKey().getGroupVals());
            groupContent.addInt(CFG_GROUP_VAL, groupEntry.getValue());
        }
    }
}
use of org.knime.base.node.preproc.groupby.GroupKey in project knime-core by knime.
the class NumericOutliersReviser method treatRows.
/**
 * Polls all rows from the row input and pushes only those to the row output that match the selected treatment:
 * outlier free rows for {@code FILTER}, rows containing at least one outlier for {@code RETAIN}. While doing so
 * the member, outlier and missing group counts are updated and, if enabled, the new domains are tracked for the
 * rows that are pushed.
 *
 * @param exec the execution context
 * @param in the row input whose outliers have to be treated
 * @param out the row output whose outliers have been treated
 * @param permIntervalsModel the model storing the permitted intervals
 * @param rowCount the row count of the row input
 * @param memberCounter the member counter
 * @param outlierRepCounter the outlier replacement counter
 * @param missingGroupsCounter the missing groups counter
 * @throws CanceledExecutionException if the user has canceled the execution
 * @throws InterruptedException if canceled
 */
private void treatRows(final ExecutionContext exec, final RowInput in, final RowOutput out, final NumericOutliersModel permIntervalsModel, final long rowCount, final MemberCounter memberCounter, final MemberCounter outlierRepCounter, final MemberCounter missingGroupsCounter) throws CanceledExecutionException, InterruptedException {
    final DataTableSpec inSpec = in.getDataTableSpec();
    // positions of the outlier columns in the input table
    final int[] outlierIndices = calculateOutlierIndicies(inSpec);
    final int noOutliers = m_outlierColNames.length;
    final double divisor = rowCount;
    long nextRowNo = 1;
    for (DataRow row = in.poll(); row != null; row = in.poll()) {
        exec.checkCanceled();
        // progress is only reported for a positive row count
        if (rowCount > 0) {
            // 'final' copy because the progress message lambda captures it
            final long rowNo = nextRowNo++;
            exec.setProgress(rowNo / divisor, () -> "Testing row " + rowNo + " of " + rowCount + " for outliers");
        }
        final GroupKey key = permIntervalsModel.getKey(row, inSpec);
        // null if the model does not know the group of this row
        final Map<String, double[]> intervals = permIntervalsModel.getGroupIntervals(key);
        boolean containsOutlier = false;
        for (int col = 0; col < noOutliers; col++) {
            final String outlierColName = m_outlierColNames[col];
            final DataCell cell = row.getCell(outlierIndices[col]);
            if (intervals == null) {
                // unknown group: only record its non-missing members
                if (!cell.isMissing()) {
                    missingGroupsCounter.incrementMemberCount(outlierColName, key);
                }
                continue;
            }
            final double[] interval = intervals.get(outlierColName);
            if (cell.isMissing()) {
                continue;
            }
            memberCounter.incrementMemberCount(outlierColName, key);
            final double val = ((DoubleValue) cell).getDoubleValue();
            // the model might not have learned anything about this key - outlier column combination
            if (interval != null && isOutlier(interval, val)) {
                containsOutlier = true;
                outlierRepCounter.incrementMemberCount(outlierColName, key);
            }
        }
        // FILTER lets outlier free rows pass, RETAIN lets rows with outliers pass
        final NumericOutliersTreatmentOption passingTreatment = containsOutlier ? NumericOutliersTreatmentOption.RETAIN : NumericOutliersTreatmentOption.FILTER;
        if (m_treatment == passingTreatment) {
            out.push(row);
            // track the domain of the values that end up in the output
            if (m_updateDomain) {
                for (int col = 0; col < noOutliers; col++) {
                    final DataCell cell = row.getCell(outlierIndices[col]);
                    if (!cell.isMissing()) {
                        m_domainUpdater.updateDomain(m_outlierColNames[col], ((DoubleValue) cell).getDoubleValue());
                    }
                }
            }
        }
    }
    out.close();
}
Aggregations