use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class AutoBinner method execute.
/**
 * Computes the bin edges of every target column and wraps them in a discretisation operation.
 *
 * <p>When no target column is configured, or "include all" is set, the target columns are first
 * filled in from the input spec via {@code addAllNumericCols}. Afterwards the edges are derived
 * according to the configured method:
 * <ul>
 * <li>{@code Method.fixedNumber}: edges are spaced evenly between the lower and upper domain bound
 * of each column,</li>
 * <li>{@code Method.sampleQuantiles}: each column is sorted on its own and the edges are computed
 * from the configured sample quantiles.</li>
 * </ul>
 *
 * @param data the input data
 * @param exec the execution context
 * @return the operation with the discretisation information
 * @throws IllegalStateException if the configured method is none of the above
 * @throws Exception if one of the delegated table operations fails
 */
public PMMLPreprocDiscretize execute(final BufferedDataTable data, final ExecutionContext exec) throws Exception {
    final DataTableSpec spec = data.getDataTableSpec();
    // Auto configuration when target is not set
    if (m_settings.getTargetColumn() == null || m_settings.getIncludeAll()) {
        addAllNumericCols(spec);
    }

    // determine intervals
    if (m_settings.getMethod().equals(Method.fixedNumber)) {
        final BufferedDataTable boundedData = calcDomainBoundsIfNeccessary(data,
            exec.createSubExecutionContext(0.9), Arrays.asList(m_settings.getTargetColumn()));
        final DataTableSpec boundedSpec = boundedData.getDataTableSpec();
        init(boundedSpec);

        final Map<String, double[]> edgesByColumn = new HashMap<String, double[]>();
        for (final String column : m_settings.getTargetColumn()) {
            final DataColumnSpec columnSpec = boundedSpec.getColumnSpec(column);
            // bounds of the domain
            final double lower = ((DoubleValue) columnSpec.getDomain().getLowerBound()).getDoubleValue();
            final double upper = ((DoubleValue) columnSpec.getDomain().getUpperBound()).getDoubleValue();

            // the edges of the bins: the outer two are the domain bounds, the inner ones are evenly spaced
            final int binCount = m_settings.getBinCount();
            final double[] binEdges = new double[binCount + 1];
            binEdges[0] = lower;
            binEdges[binEdges.length - 1] = upper;
            for (int e = 1; e < binEdges.length - 1; e++) {
                binEdges[e] = lower + e / (double) binCount * (upper - lower);
            }
            edgesByColumn.put(column, binEdges);
        }
        return createDisretizeOp(edgesByColumn);
    }

    if (m_settings.getMethod().equals(Method.sampleQuantiles)) {
        init(spec);
        final Map<String, double[]> edgesByColumn = new LinkedHashMap<String, double[]>();
        // contains all numeric columns if include all is set!
        final String[] targets = m_settings.getTargetColumn();
        final int targetCount = targets.length;
        for (final String column : targets) {
            exec.setMessage("Calculating quantiles (column \"" + column + "\")");
            // per column: 70% of the share for sorting, 30% for the quantile calculation
            final ExecutionContext sortContext = exec.createSubExecutionContext(0.7 / targetCount);
            final ExecutionContext calcContext = exec.createSubExecutionContext(0.3 / targetCount);

            // reduce the table to the current column before sorting it ascending
            final ColumnRearranger keepColumn = new ColumnRearranger(spec);
            keepColumn.keepOnly(column);
            final BufferedDataTable singleColumn =
                sortContext.createColumnRearrangeTable(data, keepColumn, sortContext);
            final SortedTable ascending = new SortedTable(singleColumn, Collections.singletonList(column),
                new boolean[] { true }, sortContext);
            sortContext.setProgress(1.0);

            final double[] binEdges = createEdgesFromQuantiles(ascending.getBufferedDataTable(), calcContext,
                m_settings.getSampleQuantiles());
            calcContext.setProgress(1.0);
            // the single-column intermediate table is no longer needed
            exec.clearTable(singleColumn);
            edgesByColumn.put(column, binEdges);
        }
        return createDisretizeOp(edgesByColumn);
    }

    throw new IllegalStateException("Unknown binning method.");
}
use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class AutoBinner method execute.
/**
 * Determines the bin edges of every target column and wraps them in a discretisation operation.
 *
 * <p>When no target column is configured, or "include all" is set, the target columns are first
 * filled in from the input spec via {@code addAllNumericCols}. The edges are then derived as follows:
 * <ul>
 * <li>{@code Method.fixedNumber} with {@code EqualityMethod.width}: edges are spaced evenly between
 * the lower and upper domain bound of each column,</li>
 * <li>{@code Method.fixedNumber} with any other equality method (equal count): all non-missing
 * values of a column are collected and handed to {@code findEdgesForEqualCount},</li>
 * <li>{@code Method.sampleQuantiles}: each column is sorted on its own and the edges are computed
 * from the configured sample quantiles.</li>
 * </ul>
 * In the equal-width and quantile cases the edges are passed through {@code toIntegerBounds} when
 * the integer-bounds setting is enabled.
 *
 * @param data the input data
 * @param exec the execution context
 * @return the operation with the discretisation information
 * @throws IllegalStateException if the configured binning method is unknown
 * @throws Exception if the execution is canceled or one of the delegated table operations fails
 */
public PMMLPreprocDiscretize execute(final BufferedDataTable data, final ExecutionContext exec) throws Exception {
    // Auto configuration when target is not set
    final DataTableSpec spec = data.getDataTableSpec();
    if (null == m_settings.getTargetColumn() || m_settings.getIncludeAll()) {
        addAllNumericCols(spec);
    }
    // determine intervals
    if (m_settings.getMethod().equals(Method.fixedNumber)) {
        if (m_settings.getEqualityMethod().equals(EqualityMethod.width)) {
            BufferedDataTable inData = calcDomainBoundsIfNeccessary(data, exec.createSubExecutionContext(0.9),
                Arrays.asList(m_settings.getTargetColumn()));
            init(inData.getDataTableSpec());
            Map<String, double[]> edgesMap = new HashMap<String, double[]>();
            for (String target : m_settings.getTargetColumn()) {
                DataTableSpec inSpec = inData.getDataTableSpec();
                DataColumnSpec targetCol = inSpec.getColumnSpec(target);
                // bounds of the domain
                double min = ((DoubleValue) targetCol.getDomain().getLowerBound()).getDoubleValue();
                double max = ((DoubleValue) targetCol.getDomain().getUpperBound()).getDoubleValue();
                // the edges of the bins: outer edges are the domain bounds, inner edges are evenly spaced
                double[] edges = new double[m_settings.getBinCount() + 1];
                edges[0] = min;
                edges[edges.length - 1] = max;
                for (int i = 1; i < edges.length - 1; i++) {
                    edges[i] = min + i / (double) m_settings.getBinCount() * (max - min);
                }
                if (m_settings.getIntegerBounds()) {
                    edges = toIntegerBounds(edges);
                }
                edgesMap.put(target, edges);
            }
            return createDisretizeOp(edgesMap);
        } else {
            // EqualityMethod.equalCount
            // NOTE(review): unlike the other branches, this one neither calls init(...) nor applies the
            // integer-bounds setting - confirm that this is intended.
            Map<String, double[]> edgesMap = new HashMap<String, double[]>();
            for (String target : m_settings.getTargetColumn()) {
                int colIndex = spec.findColumnIndex(target);
                List<Double> values = new ArrayList<Double>();
                for (DataRow row : data) {
                    // the whole table is scanned once per target column, so keep the node cancelable
                    exec.checkCanceled();
                    if (!row.getCell(colIndex).isMissing()) {
                        values.add(((DoubleValue) row.getCell(colIndex)).getDoubleValue());
                    }
                }
                edgesMap.put(target, findEdgesForEqualCount(values, m_settings.getBinCount()));
            }
            return createDisretizeOp(edgesMap);
        }
    } else if (m_settings.getMethod().equals(Method.sampleQuantiles)) {
        init(spec);
        Map<String, double[]> edgesMap = new LinkedHashMap<String, double[]>();
        final int colCount = m_settings.getTargetColumn().length;
        // contains all numeric columns if include all is set!
        for (String target : m_settings.getTargetColumn()) {
            exec.setMessage("Calculating quantiles (column \"" + target + "\")");
            // per column: 70% of the share for sorting, 30% for the quantile calculation
            ExecutionContext colSortContext = exec.createSubExecutionContext(0.7 / colCount);
            ExecutionContext colCalcContext = exec.createSubExecutionContext(0.3 / colCount);
            // reduce the table to the current column before sorting it ascending
            ColumnRearranger singleRearranger = new ColumnRearranger(spec);
            singleRearranger.keepOnly(target);
            BufferedDataTable singleColSorted =
                colSortContext.createColumnRearrangeTable(data, singleRearranger, colSortContext);
            SortedTable sorted = new SortedTable(singleColSorted, Collections.singletonList(target),
                new boolean[] { true }, colSortContext);
            colSortContext.setProgress(1.0);
            double[] edges = createEdgesFromQuantiles(sorted.getBufferedDataTable(), colCalcContext,
                m_settings.getSampleQuantiles());
            colCalcContext.setProgress(1.0);
            // the single-column intermediate table is no longer needed
            exec.clearTable(singleColSorted);
            if (m_settings.getIntegerBounds()) {
                edges = toIntegerBounds(edges);
            }
            edgesMap.put(target, edges);
        }
        return createDisretizeOp(edgesMap);
    } else {
        throw new IllegalStateException("Unknown binning method.");
    }
}
use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class ColumnToGridNodeModel method execute.
/**
 * {@inheritDoc}
 *
 * <p>Arranges the included columns of the first input table in a grid: up to
 * {@code m_configuration.getColCount()} consecutive input rows are written side by side into one
 * output row, unused grid positions stay missing. If a group column is configured, the input is
 * first reduced to the relevant columns and sorted by that column; an output row then never mixes
 * rows of different groups and carries the group value in its last cell.
 */
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
    final String[] includes = m_configuration.getIncludes();
    final String groupColumn = m_configuration.getGroupColumn();
    final boolean grouped = groupColumn != null;

    final ExecutionMonitor mainExec;
    final BufferedDataTable inputTable;
    if (grouped) {
        exec.setMessage("Sorting input table");
        final BufferedDataTable rawInput = inData[0];
        final ExecutionContext sortExec = exec.createSubExecutionContext(0.5);
        final ColumnRearranger sortFilter = new ColumnRearranger(rawInput.getDataTableSpec());
        // relevant columns: all included ones followed by the group column
        final String[] relevantCols = Arrays.copyOf(includes, includes.length + 1);
        relevantCols[includes.length] = groupColumn;
        sortFilter.keepOnly(relevantCols);
        final BufferedDataTable unsorted =
            exec.createColumnRearrangeTable(rawInput, sortFilter, exec.createSubProgress(0.0));
        final SortedTable byGroup = new SortedTable(unsorted, Collections.singletonList(groupColumn),
            new boolean[] { true }, sortExec);
        inputTable = byGroup.getBufferedDataTable();
        // the remaining half of the progress is used for assembling the output
        mainExec = exec.createSubProgress(0.5);
    } else {
        inputTable = inData[0];
        mainExec = exec;
    }

    exec.setMessage("Assembling output");
    final DataTableSpec spec = inputTable.getDataTableSpec();
    final DataTableSpec outSpec = createOutputSpec(spec);
    final BufferedDataContainer container = exec.createDataContainer(outSpec);

    final int includeCount = includes.length;
    final int[] includeIndices = new int[includeCount];
    for (int c = 0; c < includeCount; c++) {
        includeIndices[c] = spec.findColumnIndex(includes[c]);
    }

    final int gridCount = m_configuration.getColCount();
    // one extra trailing cell holds the group value (if grouping is enabled)
    final int cellCount = grouped ? includeCount * gridCount + 1 : includeCount * gridCount;
    final int groupColIndex = grouped ? spec.findColumnIndex(groupColumn) : -1;

    final DataCell[] cells = new DataCell[cellCount];
    final PushBackRowIterator rows = new PushBackRowIterator(inputTable.iterator());
    final long totalRows = inputTable.size();
    long processedRows = 0;
    long outRowIndex = 0;
    DataCell activeGroup = null;
    while (rows.hasNext()) {
        Arrays.fill(cells, DataType.getMissingCell());
        // assign group column (if enabled): peek at the next row without consuming it
        if (groupColIndex >= 0) {
            final DataRow peeked = rows.next();
            activeGroup = peeked.getCell(groupColIndex);
            cells[cells.length - 1] = activeGroup;
            rows.pushBack(peeked);
        }
        for (int slot = 0; slot < gridCount && rows.hasNext(); slot++) {
            final DataRow inRow = rows.next();
            final DataCell rowGroup;
            if (groupColIndex < 0) {
                rowGroup = null;
            } else {
                rowGroup = inRow.getCell(groupColIndex);
            }
            if (!ConvenienceMethods.areEqual(activeGroup, rowGroup)) {
                // start new group, i.e. new row
                rows.pushBack(inRow);
                break;
            }
            mainExec.setProgress(processedRows / (double) totalRows,
                "Processing row " + processedRows + "/" + totalRows + ": " + inRow.getKey());
            processedRows++;
            mainExec.checkCanceled();
            final int offset = slot * includeCount;
            for (int c = 0; c < includeCount; c++) {
                cells[offset + c] = inRow.getCell(includeIndices[c]);
            }
        }
        final RowKey key = RowKey.createRowKey(outRowIndex++);
        container.addRowToTable(new DefaultRow(key, cells));
    }
    container.close();
    return new BufferedDataTable[] { container.getTable() };
}
use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class ROCCalculator method calculateCurveData.
/**
 * Calculates one ROC curve per configured score column.
 *
 * <p>For every column in {@code m_curves} the table is sorted descending by that column and walked
 * once, counting rows of the positive class ({@code m_posClass}) and all other rows. The resulting
 * curves are stored in {@code m_outCurves}; the area under each curve is written as one row (keyed
 * by the column name) to the table stored in {@code m_outTable}. A warning, if any, is left in
 * {@code m_warningMessage}.
 *
 * @param table the table with the data
 * @param exec the execution context to use for reporting progress
 * @throws CanceledExecutionException when the user cancels the execution
 */
public void calculateCurveData(final BufferedDataTable table, final ExecutionContext exec) throws CanceledExecutionException {
    m_warningMessage = null;
    final List<ROCCurve> computedCurves = new ArrayList<ROCCurve>();
    final int classIndex = table.getDataTableSpec().findColumnIndex(m_classCol);
    final int curveCount = m_curves.size();
    final int rowCount = table.getRowCount();
    if (rowCount == 0) {
        m_warningMessage = "Input table contains no rows";
    }
    final BufferedDataContainer outCont = exec.createDataContainer(OUT_SPEC);

    for (int curveIdx = 0; curveIdx < curveCount; curveIdx++) {
        exec.checkCanceled();
        final String scoreColumn = m_curves.get(curveIdx);
        final ExecutionContext subExec = exec.createSubExecutionContext(1.0 / curveCount);
        // sort descending by the score column
        final SortedTable byScore = new SortedTable(table, Collections.singletonList(scoreColumn),
            new boolean[] { false }, subExec);
        subExec.setProgress(1.0);

        int truePositives = 0;
        int falsePositives = 0;
        // these contain the coordinates for the plot; index 0 stays at the origin
        double[] xValues = new double[rowCount + 1];
        double[] yValues = new double[rowCount + 1];
        int point = 0;
        final int scoreColIndex = byScore.getDataTableSpec().findColumnIndex(scoreColumn);
        DataCell previousScore = null;
        for (final DataRow row : byScore) {
            exec.checkCanceled();
            final DataCell realClass = row.getCell(classIndex);
            final DataCell score = row.getCell(scoreColIndex);
            if (realClass.isMissing() || score.isMissing()) {
                if (m_ignoreMissingValues) {
                    continue;
                }
                // the row is still counted below, only a warning is recorded
                m_warningMessage = "Table contains missing values.";
            }
            if (realClass.toString().equals(m_posClass)) {
                truePositives++;
            } else {
                falsePositives++;
            }
            // only advance to a new plot point when the score changes, so that consecutive rows
            // with equal scores end up in one single point
            if (!score.equals(previousScore)) {
                point++;
                previousScore = score;
            }
            xValues[point] = falsePositives;
            yValues[point] = truePositives;
        }

        // trim to the points actually used and normalise the counts to rates
        // NOTE(review): if no row was counted as positive or as negative, the respective division
        // is by zero and yields NaN - confirm that this is intended
        xValues = Arrays.copyOf(xValues, point + 1);
        yValues = Arrays.copyOf(yValues, point + 1);
        for (int p = 0; p <= point; p++) {
            xValues[p] /= falsePositives;
            yValues[p] /= truePositives;
        }
        // the curve always ends in (1, 1)
        xValues[xValues.length - 1] = 1;
        yValues[yValues.length - 1] = 1;

        double area = 0;
        for (int p = 1; p < xValues.length; p++) {
            if (xValues[p - 1] < xValues[p]) {
                // trapezoid rule: the rectangle + the triangle under
                // the segment xValues[p - 1] to xValues[p]
                area += 0.5 * (xValues[p] - xValues[p - 1]) * (yValues[p] + yValues[p - 1]);
            }
        }
        computedCurves.add(new ROCCurve(scoreColumn, xValues, yValues, area, m_maxPoints));
        outCont.addRowToTable(new DefaultRow(new RowKey(scoreColumn), new DoubleCell(area)));
    }

    m_outCurves = computedCurves;
    outCont.close();
    m_outTable = outCont.getTable();
}
use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class TargetShufflingNodeModel method execute.
/**
 * {@inheritDoc}
 *
 * <p>Shuffles the values of the configured column: the column is extracted together with a freshly
 * generated random number per row, that two-column table is sorted by the random numbers, and the
 * selected column of the input table is then replaced row by row with the values in sorted order.
 */
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
    final BufferedDataTable inTable = inData[0];
    final DataTableSpec inSpec = inTable.getDataTableSpec();
    final int colIndex = inSpec.findColumnIndex(m_settings.columnName());
    final DataColumnSpec colSpec = inSpec.getColumnSpec(colIndex);
    final String colName = colSpec.getName();

    // build a rearranger that drops every column except the selected one
    final ColumnRearranger extractor = new ColumnRearranger(inSpec);
    for (final DataColumnSpec candidate : inSpec) {
        final String candidateName = candidate.getName();
        if (candidateName.equals(colName)) {
            continue;
        }
        extractor.remove(candidateName);
    }

    // append a new column with a random number for each cell
    final String randomColName = DataTableSpec.getUniqueColumnName(inSpec, "random_col");
    final DataColumnSpec randomColSpec = new DataColumnSpecCreator(randomColName, LongCell.TYPE).createSpec();
    extractor.append(new SingleCellFactory(randomColSpec) {
        @Override
        public DataCell getCell(final DataRow row) {
            return new LongCell(m_random.nextLong());
        }
    });
    final BufferedDataTable inputCopy = exec.createBufferedDataTable(inTable, exec);
    final BufferedDataTable toSort =
        exec.createColumnRearrangeTable(inputCopy, extractor, exec.createSilentSubProgress(.2));

    // sort the random numbers (second column) ---> shuffles the selected column
    final List<String> sortColumns = new ArrayList<String>();
    sortColumns.add(toSort.getDataTableSpec().getColumnSpec(1).getName());
    final SortedTable shuffler =
        new SortedTable(toSort, sortColumns, new boolean[] { true }, exec.createSubExecutionContext(.6));
    final BufferedDataTable shuffled = shuffler.getBufferedDataTable();

    // replace the selected column with the shuffled one
    final ColumnRearranger replacer = new ColumnRearranger(inSpec);
    replacer.replace(new SingleCellFactory(colSpec) {
        // walks the shuffled table in step with the rows of the input table
        private final CloseableRowIterator m_iterator = shuffled.iterator();

        @Override
        public DataCell getCell(final DataRow row) {
            // column 0 of the shuffled table is the selected column
            return m_iterator.next().getCell(0);
        }
    }, colName);
    final BufferedDataTable result = exec.createColumnRearrangeTable(inTable, replacer, exec.createSubProgress(0.2));
    return new BufferedDataTable[] { result };
}
Aggregations