Use of org.knime.base.node.preproc.groupby.GroupByTable in project knime-core by knime.
The class NumericOutliersIntervalsCalculator, method calculatePermittedIntervals.
/**
 * Computes the permitted intervals for every group and outlier column of the given table.
 *
 * @param inTable the data table for which the outliers have to be detected
 * @param exec the execution context used for messages, progress and sub-contexts
 * @return the model holding, per group, the permitted interval of each outlier column
 * @throws Exception if the execution failed, due to internal reasons or cancelation from the outside
 */
NumericOutliersModel calculatePermittedIntervals(final BufferedDataTable inTable, final ExecutionContext exec)
    throws Exception {
    // fraction of the overall progress assigned to the statistics step; the remainder goes to the intervals
    final double statisticsShare = 0.8;
    final double intervalsShare = 1 - statisticsShare;

    // step 1: group-by statistics (quartiles and related aggregates)
    exec.setMessage(STATISTICS_MSG);
    final GroupByTable statistics;
    try {
        final ExecutionContext statisticsExec = exec.createSubExecutionContext(statisticsShare);
        statistics = getGroupByTable(inTable, statisticsExec);
        statisticsExec.setProgress(1.0);
    } catch (final OutOfMemoryError oom) {
        // running out of memory while building the aggregation is reported as a regular failure
        throw new IllegalArgumentException(MEMORY_EXCEPTION, oom);
    }
    // skipped groups are reported with the same memory message as an OutOfMemoryError
    if (!statistics.getSkippedGroupsByColName().isEmpty()) {
        throw new IllegalArgumentException(MEMORY_EXCEPTION);
    }

    // step 2: derive the permitted intervals from the statistics table
    exec.setMessage(INTERVAL_MSG);
    final ExecutionContext intervalExec = exec.createSubExecutionContext(intervalsShare);
    final NumericOutliersModel permittedIntervals =
        calcPermittedIntervals(intervalExec, statistics.getBufferedTable());

    exec.setProgress(1);
    return permittedIntervals;
}
Use of org.knime.base.node.preproc.groupby.GroupByTable in project knime-core by knime.
The class CrosstabNodeModel, method createGroupByTable.
/**
 * Builds the group-by table that provides the (optionally weighted) frequency of each group.
 *
 * @param exec execution context
 * @param table input table to group
 * @param groupByCols column selected for group-by operation
 * @return table with group and aggregation columns
 * @throws CanceledExecutionException if the group-by table generation was
 *             canceled externally
 */
private final GroupByTable createGroupByTable(final ExecutionContext exec, final BufferedDataTable table,
    final List<String> groupByCols) throws CanceledExecutionException {
    final boolean hiliteEnabled = m_settings.getEnableHiliting();
    final GlobalSettings globalSettings = GlobalSettings.builder()
        .setFileStoreFactory(FileStoreFactory.createWorkflowFileStoreFactory(exec))
        .setGroupColNames(groupByCols)
        .setMaxUniqueValues(Integer.MAX_VALUE)
        .setValueDelimiter(GlobalSettings.STANDARD_DELIMITER)
        .setDataTableSpec(table.getDataTableSpec())
        .setNoOfRows(table.size())
        .setAggregationContext(AggregationContext.ROW_AGGREGATION)
        .build();

    final String weightColumn = m_settings.getWeightColumn();
    final boolean weighted = weightColumn != null;
    // weighted: sum the weight column without missing values;
    // unweighted: count on any existing column (the first group column) with missing values included
    final boolean inclMissing = !weighted;
    final String aggregatedColumn = weighted ? weightColumn : groupByCols.get(0);
    final DataColumnSpec aggregatedSpec = table.getDataTableSpec().getColumnSpec(aggregatedColumn);
    final OperatorColumnSettings operatorSettings = new OperatorColumnSettings(inclMissing, aggregatedSpec);
    final ColumnAggregator aggregator;
    if (weighted) {
        aggregator = new ColumnAggregator(aggregatedSpec,
            new NonNegativeSumOperator(globalSettings, operatorSettings), inclMissing);
    } else {
        aggregator = new ColumnAggregator(aggregatedSpec,
            new CountOperator(globalSettings, operatorSettings), inclMissing);
    }

    // the last argument (retain order) is always false here
    final GroupByTable grouped = new BigGroupByTable(exec, table, groupByCols,
        new ColumnAggregator[] { aggregator }, globalSettings, hiliteEnabled,
        ColumnNamePolicy.AGGREGATION_METHOD_COLUMN_NAME, false);
    if (hiliteEnabled) {
        setHiliteMapping(new DefaultHiLiteMapper(grouped.getHiliteMapping()));
    }

    // surface skipped groups to the user as a node warning
    final String skippedGroupsWarning = grouped.getSkippedGroupsMessage(3, 3);
    if (skippedGroupsWarning != null) {
        setWarningMessage(skippedGroupsWarning);
    }
    return grouped;
}
Use of org.knime.base.node.preproc.groupby.GroupByTable in project knime-core by knime.
The class Pivot2NodeModel, method execute.
/**
* {@inheritDoc}
*
* Groups the first input table by the group and pivot columns, turns the pivot value combinations into
* columns, and returns three tables: the pivot table, the group totals, and the pivot (plus optional
* overall) totals.
*/
@Override
protected PortObject[] execute(final PortObject[] inData, final ExecutionContext exec) throws Exception {
final BufferedDataTable table = (BufferedDataTable) inData[0];
final List<String> groupAndPivotCols = createAllColumns();
final BufferedDataTable groupTable;
// name of the helper column carrying the original row order; null when no order has to be restored
final String orderPivotColumnName;
// overall progress split: 50% for the pivot table, 25% for the group totals, 25% for the pivot totals
ExecutionContext groupAndPivotExec = exec.createSubExecutionContext(0.5);
ExecutionContext groupExec = exec.createSubExecutionContext(0.25);
ExecutionContext pivotExec = exec.createSubExecutionContext(0.25);
// relative weights of the individual steps of the pivot-table computation; a step that is not executed
// gets weight 0, and each sub-context below receives (weight / progMainTotal) of groupAndPivotExec
double progMainTotal = 0.0;
double progMainTableAppendIndexForSort = isProcessInMemory() || isRetainOrder() ? 1.0 : 0.0;
progMainTotal += progMainTableAppendIndexForSort;
double progMainTableGroup = 5.0;
progMainTotal += progMainTableGroup;
double progMainTableInMemSort = isProcessInMemory() ? 3.0 : 0.0;
progMainTotal += progMainTableInMemSort;
double progMainTableGetPivots = 1.0;
progMainTotal += progMainTableGetPivots;
double progMainTableFillPivots = 1.0;
progMainTotal += progMainTableFillPivots;
double progMainTableRestoreSort = isProcessInMemory() || isRetainOrder() ? 1.0 : 0.0;
progMainTotal += progMainTableRestoreSort;
double progMainTableReplaceRowKey = isProcessInMemory() ? 1.0 : 0.0;
progMainTotal += progMainTableReplaceRowKey;
if (isProcessInMemory() || isRetainOrder()) {
exec.setMessage("Keeping row order");
// pick a column name that does not clash with any column of the input table
final String retainOrderCol = DataTableSpec.getUniqueColumnName(table.getDataTableSpec(), "#pivot_order#");
// append temp. id column with minimum-aggregation method
final ColumnAggregator[] colAggregators = getColumnAggregators().toArray(new ColumnAggregator[0]);
// columns handed to appendOrderColumn: group, pivot and aggregated columns plus the order column
final Set<String> workingCols = new LinkedHashSet<String>();
workingCols.addAll(groupAndPivotCols);
for (final ColumnAggregator ca : colAggregators) {
workingCols.add(ca.getOriginalColName());
}
workingCols.add(retainOrderCol);
final BufferedDataTable appTable = GroupByTable.appendOrderColumn(groupAndPivotExec.createSubExecutionContext(progMainTableAppendIndexForSort / progMainTotal), table, workingCols, retainOrderCol);
final DataColumnSpec retainOrderColSpec = appTable.getSpec().getColumnSpec(retainOrderCol);
// extend the configured aggregators by one extra aggregator on the order column (row-order method)
final ColumnAggregator[] aggrs = new ColumnAggregator[colAggregators.length + 1];
System.arraycopy(colAggregators, 0, aggrs, 0, colAggregators.length);
aggrs[colAggregators.length] = new ColumnAggregator(retainOrderColSpec, AggregationMethods.getRowOrderMethod(), true);
// name under which the aggregated order column appears in the grouped table
orderPivotColumnName = getColumnNamePolicy().createColumName(aggrs[colAggregators.length]);
exec.setMessage("Grouping main table");
final GroupByTable groupByTable = createGroupByTable(groupAndPivotExec.createSubExecutionContext(progMainTableGroup / progMainTotal), appTable, groupAndPivotCols, isProcessInMemory(), false, /* retain order always false; handled by pivoting */
Arrays.asList(aggrs));
// true then sort table by group&pivot columns
if (isProcessInMemory()) {
exec.setMessage("Sorting group table");
final boolean[] sortDirection = new boolean[groupAndPivotCols.size()];
// ensure that missing values are at the end by sorting in ascending order
Arrays.fill(sortDirection, true);
final SortedTable sortedGroupByTable = new SortedTable(groupByTable.getBufferedTable(), groupAndPivotCols, sortDirection, groupAndPivotExec.createSubExecutionContext(progMainTableInMemSort / progMainTotal));
groupTable = sortedGroupByTable.getBufferedDataTable();
} else {
groupTable = groupByTable.getBufferedTable();
}
} else {
// no order bookkeeping needed: group the input table directly with the configured aggregators
exec.setMessage("Grouping main table");
final GroupByTable groupByTable = createGroupByTable(groupAndPivotExec.createSubExecutionContext(progMainTableGroup / progMainTotal), table, groupAndPivotCols, isProcessInMemory(), false, getColumnAggregators());
groupTable = groupByTable.getBufferedTable();
orderPivotColumnName = null;
}
final List<String> pivotCols = m_pivotCols.getIncludeList();
final int[] pivotIdx = new int[pivotCols.size()];
final DataTableSpec groupSpec = groupTable.getSpec();
// one value set per pivot column; entries left null by createCombinedPivots are created lazily below
final Set<String>[] combPivots = createCombinedPivots(groupSpec, pivotCols);
for (int i = 0; i < pivotIdx.length; i++) {
pivotIdx[i] = groupSpec.findColumnIndex(pivotCols.get(i));
}
exec.setProgress("Determining pivots...");
ExecutionContext fillExec = groupAndPivotExec.createSubExecutionContext(progMainTableGetPivots / progMainTotal);
final long groupTableSize = groupTable.size();
long groupIndex = 0;
// scan the grouped table and collect the string representation of every pivot value per pivot column;
// missing cells are only collected when missing values are not ignored
for (final DataRow row : groupTable) {
for (int i = 0; i < pivotIdx.length; i++) {
if (combPivots[i] == null) {
combPivots[i] = new LinkedHashSet<String>();
}
final DataCell cell = row.getCell(pivotIdx[i]);
if (cell.isMissing()) {
if (!m_ignoreMissValues.getBooleanValue()) {
combPivots[i].add(cell.toString());
}
} else {
combPivots[i].add(cell.toString());
}
}
// the post-increment is evaluated first, so the progress fraction uses the 0-based index while the
// message shows the 1-based position
fillExec.setProgress(groupIndex++ / (double) groupTableSize, String.format("Group \"%s\" (%d/%d)", row.getKey(), groupIndex, groupTableSize));
fillExec.checkCanceled();
}
// filled by createOutSpec and passed on to fillPivotTable
final Map<String, Integer> pivotStarts = new LinkedHashMap<String, Integer>();
final DataTableSpec outSpec = createOutSpec(groupSpec, combPivots, pivotStarts, orderPivotColumnName);
exec.setProgress("Filling pivot table");
BufferedDataTable pivotTable = fillPivotTable(groupTable, outSpec, pivotStarts, groupAndPivotExec.createSubExecutionContext(progMainTableFillPivots / progMainTotal), orderPivotColumnName);
if (orderPivotColumnName != null) {
// sort ascending by the helper order column, then drop that column from the result
exec.setMessage("Restoring row order");
final SortedTable sortedPivotTable = new SortedTable(pivotTable, Arrays.asList(new String[] { orderPivotColumnName }), new boolean[] { true }, groupAndPivotExec.createSubExecutionContext(progMainTableRestoreSort / progMainTotal));
pivotTable = sortedPivotTable.getBufferedDataTable();
final ColumnRearranger colre = new ColumnRearranger(pivotTable.getSpec());
colre.remove(orderPivotColumnName);
pivotTable = exec.createColumnRearrangeTable(pivotTable, colre, exec.createSilentSubProgress(0.0));
}
// temp fix for bug 3286
if (isProcessInMemory()) {
// if process in memory is true, RowKey's needs to be re-computed
final BufferedDataContainer rowkeyBuf = groupAndPivotExec.createSubExecutionContext(progMainTableReplaceRowKey / progMainTotal).createDataContainer(pivotTable.getSpec());
long rowIndex = 0;
// copy every row, replacing its key by one generated from the running row index
for (DataRow row : pivotTable) {
rowkeyBuf.addRowToTable(new DefaultRow(RowKey.createRowKey(rowIndex++), row));
}
rowkeyBuf.close();
pivotTable = rowkeyBuf.getTable();
}
groupAndPivotExec.setProgress(1.0);
/* Fill the 3rd port */
exec.setMessage("Determining pivot totals");
// relative weights of the pivot-totals steps, used the same way as the progMain* weights above
double progPivotTotal = 0.0;
double progPivotGroup = 5.0;
progPivotTotal += progPivotGroup;
double progPivotFillMissing = 1.0;
progPivotTotal += progPivotFillMissing;
double progPivotFillPivots = 1.0;
progPivotTotal += progPivotFillPivots;
double progPivotOverallTotals = m_totalAggregation.getBooleanValue() ? 5.0 : 0.0;
progPivotTotal += progPivotOverallTotals;
// create pivot table only on pivot columns (for grouping)
// perform pivoting: result in single line
final GroupByTable rowGroup = createGroupByTable(pivotExec.createSubExecutionContext(progPivotGroup / progPivotTotal), table, m_pivotCols.getIncludeList(), isProcessInMemory(), isRetainOrder(), getColumnAggregators());
final BufferedDataTable rowGroupTable = rowGroup.getBufferedTable();
// fill group columns with missing cells
final ColumnRearranger colre = new ColumnRearranger(rowGroupTable.getDataTableSpec());
// insert one all-missing column per group-by column at the front, using the spec of the output column
// at the same position
for (int i = 0; i < getGroupByColumns().size(); i++) {
final DataColumnSpec cspec = outSpec.getColumnSpec(i);
final CellFactory factory = new SingleCellFactory(cspec) {
/**
* {@inheritDoc}
*/
@Override
public DataCell getCell(final DataRow row) {
return DataType.getMissingCell();
}
};
colre.insertAt(i, factory);
}
final BufferedDataTable groupedRowTable = exec.createColumnRearrangeTable(rowGroupTable, colre, pivotExec.createSubExecutionContext(progPivotFillMissing / progPivotTotal));
// same filling routine as for the pivot table, but called without an order column (last argument null)
BufferedDataTable pivotRowsTable = fillPivotTable(groupedRowTable, outSpec, pivotStarts, pivotExec.createSubExecutionContext(progPivotFillPivots / progPivotTotal), null);
if (orderPivotColumnName != null) {
// outSpec was created with the order column, so remove it from the totals table as well
final ColumnRearranger colre2 = new ColumnRearranger(pivotRowsTable.getSpec());
colre2.remove(orderPivotColumnName);
pivotRowsTable = exec.createColumnRearrangeTable(pivotRowsTable, colre2, exec.createSilentSubProgress(0.0));
}
// total aggregation without grouping
if (m_totalAggregation.getBooleanValue()) {
@SuppressWarnings("unchecked") final GroupByTable totalGroup = createGroupByTable(pivotExec.createSubExecutionContext(progPivotOverallTotals / progPivotTotal), table, Collections.EMPTY_LIST, isProcessInMemory(), isRetainOrder(), getColumnAggregators());
final BufferedDataTable totalGroupTable = totalGroup.getBufferedTable();
final DataTableSpec pivotsRowsSpec = pivotRowsTable.getSpec();
final DataTableSpec totalGroupSpec = totalGroupTable.getSpec();
// the overall-totals columns are appended to the right of the pivot-totals columns
final DataTableSpec overallTotalSpec = new DataTableSpec(pivotsRowsSpec, totalGroupSpec);
final BufferedDataContainer buf = exec.createDataContainer(overallTotalSpec);
if (pivotRowsTable.size() > 0) {
// only the first row of each table is read and concatenated into a single "Totals" row
final List<DataCell> pivotTotalsCells = new ArrayList<DataCell>();
final DataRow pivotsRow = pivotRowsTable.iterator().next();
for (final DataCell cell : pivotsRow) {
pivotTotalsCells.add(cell);
}
final DataRow totalGroupRow = totalGroupTable.iterator().next();
for (final DataCell cell : totalGroupRow) {
pivotTotalsCells.add(cell);
}
buf.addRowToTable(new DefaultRow(new RowKey("Totals"), pivotTotalsCells));
}
buf.close();
pivotRowsTable = buf.getTable();
}
pivotExec.setProgress(1.0);
/* Fill the 2nd port: important to create this last since it will create
* the final hilite handler (mapping) for port #1 AND #2 (bug 3270) */
exec.setMessage("Creating group totals");
// create group table only on group columns; no pivoting
final BufferedDataTable columnGroupTable = createGroupByTable(groupExec, table, getGroupByColumns()).getBufferedTable();
return new PortObject[] { // pivot table
pivotTable, // group totals
columnGroupTable, // pivot and overall totals
pivotRowsTable };
}
Use of org.knime.base.node.preproc.groupby.GroupByTable in project knime-core by knime.
The class NumericOutliersIntervalsCalculator, method getGroupByTable.
/**
 * Creates the group-by table for the given input, choosing the implementation according to the
 * in-memory setting.
 *
 * @param inTable the input data table
 * @param exec the execution context
 * @return the group by table w.r.t. the selected settings
 * @throws CanceledExecutionException if the user has canceled the execution
 */
private GroupByTable getGroupByTable(final BufferedDataTable inTable, final ExecutionContext exec)
    throws CanceledExecutionException {
    final GlobalSettings globalSettings = getGlobalSettings(inTable);
    final ColumnAggregator[] aggregators = getAggretators(inTable.getDataTableSpec(), globalSettings);

    // both variants receive identical arguments; only the table implementation differs
    if (m_inMemory) {
        return new MemoryGroupByTable(exec, inTable,
            Arrays.stream(m_groupColNames).collect(Collectors.toList()), aggregators, globalSettings, false,
            COLUMN_NAME_POLICY, false);
    }
    return new BigGroupByTable(exec, inTable,
        Arrays.stream(m_groupColNames).collect(Collectors.toList()), aggregators, globalSettings, false,
        COLUMN_NAME_POLICY, false);
}
Use of org.knime.base.node.preproc.groupby.GroupByTable in project knime-core by knime.
The class CrosstabNodeModel, method execute.
/**
 * {@inheritDoc}
 *
 * Groups the input table by the row and column variable, computes totals and cross-tabulation
 * statistics, and writes one output row per group containing the two variables followed by the
 * configured properties. Returns the cross-tabulation table and the statistics table.
 */
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec)
    throws Exception {
    final BufferedDataTable table = inData[0];
    final List<String> cols = Arrays.asList(m_settings.getRowVarColumn(), m_settings.getColVarColumn());
    final GroupByTable groupTable = createGroupByTable(exec.createSubExecutionContext(0.6), table, cols);
    final BufferedDataTable freqTable = groupTable.getBufferedTable();
    // the index of the row variable in the group table
    final int rowVarI = 0;
    // the index of the column variable in the group table
    final int colVarI = 1;
    // the index of the frequency in the group table
    final int freqI = 2;
    final CrosstabTotals totals = computeTotals(freqTable, rowVarI, colVarI, freqI);
    final CrosstabProperties naming = CrosstabProperties.create(m_settings.getNamingVersion());
    final CrosstabStatisticsCalculator stats =
        new CrosstabStatisticsCalculator(freqTable, rowVarI, colVarI, freqI, totals, naming);
    stats.run(exec.createSubExecutionContext(0.1));
    final BufferedDataTable propsTable = stats.getTable();
    final int cellChiSquareI = propsTable.getDataTableSpec().findColumnIndex(naming.getCellChiSquareName());

    // create output table
    final BufferedDataContainer cont = exec.createDataContainer(createOutSpec(table.getSpec()));
    // both tables are walked in lock-step: one statistics row is consumed per frequency row
    final RowIterator freqIter = freqTable.iterator();
    final RowIterator statsIter = propsTable.iterator();
    // property name -> cell index in the output row; indices 0 and 1 hold the row and column variable
    final Map<String, Integer> props = new LinkedHashMap<String, Integer>();
    for (int i = 0; i < m_settings.getProperties().size(); i++) {
        final String prop = m_settings.getProperties().get(i);
        props.put(prop, i + 2);
    }
    for (long i = 0; i < freqTable.size(); i++) {
        // let the user abort while the output rows are assembled
        exec.checkCanceled();
        final DataCell[] cells = new DataCell[props.size() + 2];
        final DataRow freqRow = freqIter.next();
        // add the row variable
        final DataCell rowVar = freqRow.getCell(rowVarI);
        cells[0] = rowVar;
        // add the column variable
        final DataCell colVar = freqRow.getCell(colVarI);
        cells[1] = colVar;
        // the frequency; a missing frequency cell counts as zero
        final DataCell freqCell = freqRow.getCell(freqI);
        final double freq = freqCell.isMissing() ? 0.0 : ((DoubleValue) freqCell).getDoubleValue();
        // NOTE(review): addToCells presumably stores the value only for properties present in props - confirm
        addToCells(cells, props, naming.getFrequencyName(), new DoubleCell(freq));
        // the cell chi-square
        final DataRow statsRow = statsIter.next();
        addToCells(cells, props, naming.getCellChiSquareName(), statsRow.getCell(cellChiSquareI));
        // the total
        final double total = totals.getTotal();
        addToCells(cells, props, naming.getTotalCountName(), new DoubleCell(total));
        // the rowTotal
        final double rowTotal = totals.getRowTotal().get(rowVar);
        addToCells(cells, props, naming.getTotalRowCountName(), new DoubleCell(rowTotal));
        // the column Total
        final double colTotal = totals.getColTotal().get(colVar);
        addToCells(cells, props, naming.getTotalColCountName(), new DoubleCell(colTotal));
        // the percent = frequency / total; guarded like the expected frequency below so that a zero
        // total yields 0.0 instead of NaN
        final double percent = 0.0 == total ? 0.0 : 100 * (freq / total);
        addToCells(cells, props, naming.getPercentName(), new DoubleCell(percent));
        // the row percent
        final double rowPercent = 0.0 == freq ? 0.0 : 100.0 * (freq / rowTotal);
        addToCells(cells, props, naming.getRowPercentName(), new DoubleCell(rowPercent));
        // the col percent
        final double colPercent = 0.0 == freq ? 0.0 : 100.0 * (freq / colTotal);
        addToCells(cells, props, naming.getColPercentName(), new DoubleCell(colPercent));
        // the expected frequency
        final double expected = 0.0 == total ? 0.0 : colTotal / total * rowTotal;
        addToCells(cells, props, naming.getExpectedFrequencyName(), new DoubleCell(expected));
        // the deviation (the difference of the frequency to the
        // expected frequency)
        final double deviation = freq - expected;
        addToCells(cells, props, naming.getDeviationName(), new DoubleCell(deviation));
        final DefaultRow row = new DefaultRow(RowKey.createRowKey(i), cells);
        cont.addRowToTable(row);
    }
    cont.close();
    // keep the results in the node model fields as well as returning them
    m_outTable = cont.getTable();
    m_statistics = stats.getStatistics();
    m_statOutTable = stats.getStatistics().getTable();
    m_totals = totals;
    return new BufferedDataTable[] { m_outTable, m_statOutTable };
}
Aggregations