use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class AutoBinner method execute.
/**
 * Determines the bin edges of every included column with the binning
 * method configured in the settings and wraps the result into a
 * discretization operation.
 *
 * @param data the input data
 * @param exec the execution context
 * @return the operation with the discretisation information
 * @throws Exception if determining the bins fails
 */
public PMMLPreprocDiscretize execute(final BufferedDataTable data, final ExecutionContext exec) throws Exception {
    final DataTableSpec spec = data.getDataTableSpec();

    if (m_settings.getMethod().equals(Method.fixedNumber)) {
        if (m_settings.getEqualityMethod().equals(EqualityMethod.width)) {
            // equal width: split the domain range of each column into the
            // configured number of bins
            BufferedDataTable boundedData = calcDomainBoundsIfNeccessary(data, exec.createSubExecutionContext(0.9), Arrays.asList(m_included));
            init(boundedData.getDataTableSpec());
            Map<String, double[]> widthEdges = new HashMap<String, double[]>();
            for (String column : m_included) {
                DataColumnSpec columnSpec = boundedData.getDataTableSpec().getColumnSpec(column);
                // lower and upper bound of the column domain
                double lower = ((DoubleValue) columnSpec.getDomain().getLowerBound()).getDoubleValue();
                double upper = ((DoubleValue) columnSpec.getDomain().getUpperBound()).getDoubleValue();
                double[] bounds = calculateBounds(m_settings.getBinCount(), lower, upper);
                widthEdges.put(column, m_settings.getIntegerBounds() ? toIntegerBoundaries(bounds) : bounds);
            }
            return createDisretizeOp(widthEdges);
        }

        // EqualityMethod.equalCount: collect all non-missing values of each
        // column and derive the edges from them
        Map<String, double[]> countEdges = new HashMap<String, double[]>();
        for (String column : m_included) {
            int columnIndex = spec.findColumnIndex(column);
            List<Double> presentValues = new ArrayList<Double>();
            for (DataRow row : data) {
                if (row.getCell(columnIndex).isMissing()) {
                    continue;
                }
                presentValues.add(((DoubleValue) row.getCell(columnIndex)).getDoubleValue());
            }
            countEdges.put(column, findEdgesForEqualCount(presentValues, m_settings.getBinCount()));
        }
        return createDisretizeOp(countEdges);
    }

    if (m_settings.getMethod().equals(Method.sampleQuantiles)) {
        init(spec);
        // insertion-ordered, i.e. edges are kept in the order of m_included
        Map<String, double[]> quantileEdges = new LinkedHashMap<String, double[]>();
        final int colCount = m_included.length;
        // contains all numeric columns if include all is set!
        for (String column : m_included) {
            exec.setMessage("Calculating quantiles (column \"" + column + "\")");
            // per column: 70% of the progress share for sorting, 30% for
            // the quantile calculation
            ExecutionContext sortExec = exec.createSubExecutionContext(0.7 / colCount);
            ExecutionContext calcExec = exec.createSubExecutionContext(0.3 / colCount);

            // reduce the table to the current column before sorting it
            ColumnRearranger keepColumn = new ColumnRearranger(spec);
            keepColumn.keepOnly(column);
            BufferedDataTable singleColumn = sortExec.createColumnRearrangeTable(data, keepColumn, sortExec);
            SortedTable sortedColumn = new SortedTable(singleColumn, Collections.singletonList(column), new boolean[] { true }, sortExec);
            sortExec.setProgress(1.0);

            double[] quantiles = createEdgesFromQuantiles(sortedColumn.getBufferedDataTable(), calcExec, m_settings.getSampleQuantiles());
            calcExec.setProgress(1.0);
            exec.clearTable(singleColumn);

            quantileEdges.put(column, m_settings.getIntegerBounds() ? toIntegerBoundaries(quantiles) : quantiles);
        }
        return createDisretizeOp(quantileEdges);
    }

    throw new IllegalStateException("Unknown binning method.");
}
use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class CAIMDiscretizationNodeModel method createAllIntervalBoundaries.
/**
 * Sorts the table in ascending order on the given column (and, if reduced
 * boundaries are enabled, additionally on the class column) and walks
 * through the sorted rows to collect candidate interval boundaries. A
 * candidate is the midpoint of two adjacent distinct values; it is only
 * created at a value change for which a class change has been observed
 * since the previous value change. The candidates are chained in a doubly
 * linked list whose head element holds
 * {@link Double#NEGATIVE_INFINITY}.
 *
 * @param table the table with the data
 * @param columnIndex the column of interest
 * @param exec the execution context handed to the sorter
 * @return the scheme built from the list head and the number of added
 *         boundaries
 * @throws Exception if sorting the table fails
 */
private BoundaryScheme createAllIntervalBoundaries(final BufferedDataTable table, final int columnIndex, final ExecutionContext exec) throws Exception {
    // primary sort criterion is the column of interest, optionally
    // followed by the class column
    List<String> sortColumns = new ArrayList<String>();
    sortColumns.add(table.getDataTableSpec().getColumnSpec(columnIndex).getName());
    if (m_reducedBoundaries) {
        sortColumns.add(m_classColumnName.getStringValue());
    }
    // every sort column is sorted in ascending order
    boolean[] ascending = new boolean[sortColumns.size()];
    Arrays.fill(ascending, true);
    SortedTable sortedTable = new SortedTable(table, sortColumns, ascending, m_sortInMemory.getBooleanValue(), exec);
    RowIterator rows = sortedTable.iterator();

    // advance to the first row with a non-missing value; it provides the
    // start value and the class value belonging to it
    double currentValue = Double.NaN;
    String firstClassOfCurrentValue = null;
    boolean startFound = false;
    while (!startFound && rows.hasNext()) {
        DataRow candidate = rows.next();
        DataCell candidateCell = candidate.getCell(columnIndex);
        if (!candidateCell.isMissing()) {
            currentValue = ((DoubleValue) candidateCell).getDoubleValue();
            firstClassOfCurrentValue = candidate.getCell(m_classifyColumnIndex).toString();
            startFound = true;
        }
    }

    // value of the last value change that did not produce a boundary; NaN
    // if there is none. Needed to add that skipped boundary afterwards.
    double skippedChangeValue = Double.NaN;
    // head of the doubly linked boundary list and its current tail
    LinkedDouble head = new LinkedDouble(Double.NEGATIVE_INFINITY);
    LinkedDouble tail = head;
    int numBoundaries = 0;
    // whether a class value different from the first class value of the
    // current value has been seen since the last value change
    boolean classChanged = false;

    while (rows.hasNext()) {
        DataRow row = rows.next();
        double value = ((DoubleValue) row.getCell(columnIndex)).getDoubleValue();
        String classValue = row.getCell(m_classifyColumnIndex).toString();
        boolean valueChanged = value != currentValue;

        if (!classChanged && !firstClassOfCurrentValue.equals(classValue)) {
            classChanged = true;
            // the class change coincides with a value change, so the
            // skipped value is not needed any more
            if (valueChanged) {
                skippedChangeValue = Double.NaN;
            }
        }

        // rows repeating the current value never add a boundary
        if (!valueChanged) {
            continue;
        }

        if (classChanged) {
            if (!Double.isNaN(skippedChangeValue)) {
                // catch up on the boundary skipped at the previous value
                // change
                LinkedDouble skipped = new LinkedDouble((currentValue + skippedChangeValue) / 2.0D);
                skipped.m_previous = tail;
                tail.m_next = skipped;
                tail = skipped;
                numBoundaries++;
            }
            // boundary between the previous and the new value
            LinkedDouble midpoint = new LinkedDouble((value + currentValue) / 2.0D);
            midpoint.m_previous = tail;
            tail.m_next = midpoint;
            tail = midpoint;
            numBoundaries++;
            skippedChangeValue = Double.NaN;
        } else {
            // no class change: remember the value in case a boundary has
            // to be added later
            skippedChangeValue = currentValue;
        }

        // start the sequence of the new value
        currentValue = value;
        firstClassOfCurrentValue = classValue;
        classChanged = false;
    }
    return new BoundaryScheme(head, numBoundaries);
}
use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class NewJoinerNodeModel method execute.
/**
 * {@inheritDoc}
 *
 * Joins the first (left) input table with the second (right) input table.
 * The right table is sorted into the row order of the left table (rows
 * without a matching left row key are placed at the end) and both tables
 * are then merged in a single pass. Depending on the configured join mode,
 * left and/or right rows without a partner are padded with missing cells.
 */
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
    BufferedDataTable leftTable = inData[0];
    BufferedDataTable rightTable = inData[1];
    m_secondTableColIndex = rightTable.getDataTableSpec().findColumnIndex(m_settings.secondTableColumn());
    if (!NewJoinerSettings.ROW_KEY_IDENTIFIER.equals(m_settings.secondTableColumn()) && (m_secondTableColIndex == -1)) {
        throw new InvalidSettingsException("Join column '" + m_settings.secondTableColumn() + "' not found in second table");
    }
    BufferedDataContainer dc = exec.createDataContainer(createSpec(new DataTableSpec[] { leftTable.getDataTableSpec(), rightTable.getDataTableSpec() }));
    // create a row with missing values for left or full outer joins
    DataCell[] missingCells = new DataCell[rightTable.getDataTableSpec().getNumColumns()];
    for (int i = 0; i < missingCells.length; i++) {
        missingCells[i] = DataType.getMissingCell();
    }
    DataRow missingRow = new DefaultRow(new RowKey(""), missingCells);
    exec.setMessage("Reading first table");
    // build a map for sorting the second table which maps the row keys of
    // the first table to their row number
    final Map<String, Integer> orderMap = buildTableOrdering(leftTable, exec);
    Comparator<DataRow> rowComparator = new Comparator<DataRow>() {
        public int compare(final DataRow o1, final DataRow o2) {
            Integer k1 = orderMap.get(getRightJoinKey(o1));
            Integer k2 = orderMap.get(getRightJoinKey(o2));
            if ((k1 != null) && (k2 != null)) {
                // compareTo instead of subtraction: cannot overflow
                return k1.compareTo(k2);
            } else if (k1 != null) {
                // rows without a partner in the left table go to the end
                return -1;
            } else if (k2 != null) {
                return 1;
            } else {
                return 0;
            }
        }
    };
    // sort the second table based on the key order from the first table
    // non-matching rows are placed at the end
    exec.setMessage("Sorting second table");
    SortedTable rightSortedTable = new SortedTable(rightTable, rowComparator, false, exec.createSubExecutionContext(0.7));
    Iterator<DataRow> lit = leftTable.iterator();
    Iterator<DataRow> rit = rightSortedTable.iterator();
    exec.setMessage("Joining tables");
    // estimated number of output rows, only used for progress reporting
    final double max;
    // lofj: keep unmatched left rows; rofj: keep unmatched right rows
    boolean lofj = false;
    boolean rofj = false;
    if (JoinMode.InnerJoin.equals(m_settings.joinMode())) {
        max = Math.min(leftTable.getRowCount(), rightTable.getRowCount());
    } else if (JoinMode.LeftOuterJoin.equals(m_settings.joinMode())) {
        max = leftTable.getRowCount();
        lofj = true;
    } else if (JoinMode.RightOuterJoin.equals(m_settings.joinMode())) {
        max = rightTable.getRowCount();
        rofj = true;
    } else {
        max = Math.max(leftTable.getRowCount(), rightTable.getRowCount());
        lofj = true;
        rofj = true;
    }
    // now join the two tables
    int p = 0;
    DataRow lrow = lit.hasNext() ? lit.next() : null;
    DataRow rrow = rit.hasNext() ? rit.next() : null;
    String lkey = (lrow != null) ? lrow.getKey().getString() : null;
    String rkey = (rrow != null) ? getRightJoinKey(rrow) : null;
    if (lofj && (lrow != null) && (rrow == null)) {
        // The right table is empty, so the merge loop below is skipped.
        // The first left row has already been taken from the iterator and
        // would be lost, because the trailing loop only emits the rows
        // that are still left in the iterator.
        dc.addRowToTable(createJoinedRow(lkey, lrow, missingRow));
        exec.setProgress(0.7 + 0.3 * p++ / max);
    }
    outer: while ((lrow != null) && (rrow != null)) {
        exec.checkCanceled();
        String key = lkey;
        if (lkey.equals(rkey)) {
            // loop over all matching rows in the second table
            for (int i = 0; lkey.equals(rkey); i++) {
                dc.addRowToTable(createJoinedRow(key, lrow, rrow));
                exec.setProgress(0.7 + 0.3 * p++ / max);
                if (!rit.hasNext()) {
                    // right table exhausted; the current left row has
                    // been joined already
                    rrow = null;
                    break outer;
                }
                rrow = rit.next();
                rkey = getRightJoinKey(rrow);
                // further matches of the same left row get a suffixed key
                key = lkey + m_settings.keySuffix() + i;
            }
        } else if (lofj) {
            // no matching row from right table => fill with missing values
            // if left or full outer join is required
            dc.addRowToTable(createJoinedRow(lkey, lrow, missingRow));
            exec.setProgress(0.7 + 0.3 * p++ / max);
        }
        if (!lit.hasNext()) {
            break outer;
        }
        lrow = lit.next();
        lkey = lrow.getKey().getString();
    }
    if (lit.hasNext() && lofj) {
        // outer join: add the remaining left rows, which have no partner
        while (lit.hasNext()) {
            lrow = lit.next();
            dc.addRowToTable(createJoinedRow(lrow.getKey().toString(), lrow, missingRow));
            exec.setProgress(0.7 + 0.3 * p++ / max);
        }
    } else if ((rrow != null) && rofj) {
        // add remaining non-joined rows from the right table if right or
        // full outer join
        missingCells = new DataCell[leftTable.getDataTableSpec().getNumColumns()];
        for (int i = 0; i < missingCells.length; i++) {
            missingCells[i] = DataType.getMissingCell();
        }
        missingRow = new DefaultRow(new RowKey(""), missingCells);
        boolean warningSet = false;
        while (true) {
            String key = rrow.getKey().toString();
            int c = 0;
            // on a duplicate key, retry with "_r" appended to the key; the
            // exception is rethrown once more than 10 attempts have failed
            while (true) {
                try {
                    dc.addRowToTable(createJoinedRow(key, missingRow, rrow));
                    exec.setProgress(0.7 + 0.3 * p++ / max);
                    break;
                } catch (DuplicateKeyException ex) {
                    if (++c > 10) {
                        throw ex;
                    }
                    key = key + "_r";
                    if (!warningSet) {
                        setWarningMessage("Encountered and fixed some " + "duplicate row keys at the end of the " + "table");
                        warningSet = true;
                    }
                }
            }
            if (!rit.hasNext()) {
                break;
            }
            rrow = rit.next();
        }
    }
    dc.close();
    return new BufferedDataTable[] { dc.getTable() };
}
Aggregations