use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class LiftChartNodeModel method execute.
/**
 * {@inheritDoc}
 *
 * <p>
 * Sorts the input table by the probability column in descending order and
 * walks over the sorted rows. Rows whose response cell is missing are skipped
 * (a warning is set); all other rows are numbered consecutively and the
 * numbers of the rows matching the response label are remembered. The rows
 * are then split into intervals of the configured width (in percent) and two
 * view tables are filled: lift / baseline / cumulative lift per interval, and
 * cumulative response versus baseline. The sorted table is the only output.
 *
 * @throws IllegalArgumentException if the configured interval width is not a
 *             positive number
 */
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
    ConvenienceMethods.checkTableSize(inData[0]);

    // Validate the interval width before the (potentially expensive) sort.
    // A zero, negative or NaN width would otherwise end in a huge or
    // negative array size further down.
    double partWidth = Double.parseDouble(m_intervalWidth.getStringValue());
    if (!(partWidth > 0)) {
        throw new IllegalArgumentException("Interval width must be a positive number but was " + partWidth);
    }
    int nrParts = (int) Math.ceil(100.0 / partWidth);

    int predColIndex = inData[0].getDataTableSpec().findColumnIndex(m_responseColumn.getStringValue());
    List<String> inclList = new LinkedList<String>();
    inclList.add(m_probabilityColumn.getStringValue());
    // false = descending: the highest probabilities come first
    boolean[] order = new boolean[] { false };
    SortedTable st = new SortedTable(inData[0], inclList, order, exec);

    long totalResponses = 0;
    // indices (within the non-missing rows) of all rows carrying the response label
    List<Integer> positiveResponses = new LinkedList<Integer>();
    // counts only rows whose response cell is not missing
    int rowIndex = 0;
    boolean sawMissing = false;
    for (DataRow row : st) {
        if (row.getCell(predColIndex).isMissing()) {
            sawMissing = true;
            continue;
        }
        String response = ((StringValue) row.getCell(predColIndex)).getStringValue().trim();
        if (response.equalsIgnoreCase(m_responseLabel.getStringValue())) {
            totalResponses++;
            positiveResponses.add(rowIndex);
        }
        rowIndex++;
    }
    if (sawMissing) {
        setWarningMessage("There are missing values. Please check your data.");
    }

    // number of positive responses per interval
    int[] counter = new int[nrParts];
    // number of rows per interval (the last interval may be shorter)
    int partWidthAbsolute = (int) Math.ceil(rowIndex / (double) nrParts);
    double avgResponse = (double) positiveResponses.size() / rowIndex;
    for (int rIndex : positiveResponses) {
        counter[rIndex / partWidthAbsolute]++;
    }

    DataColumnSpec[] colSpec = new DataColumnSpec[3];
    colSpec[0] = new DataColumnSpecCreator("Lift", DoubleCell.TYPE).createSpec();
    colSpec[1] = new DataColumnSpecCreator("Baseline", DoubleCell.TYPE).createSpec();
    colSpec[2] = new DataColumnSpecCreator("Cumulative Lift", DoubleCell.TYPE).createSpec();
    DataTableSpec tableSpec = new DataTableSpec(colSpec);
    DataContainer cont = exec.createDataContainer(tableSpec);

    colSpec = new DataColumnSpec[2];
    colSpec[0] = new DataColumnSpecCreator("Actual", DoubleCell.TYPE).createSpec();
    colSpec[1] = new DataColumnSpecCreator("Baseline", DoubleCell.TYPE).createSpec();
    tableSpec = new DataTableSpec(colSpec);
    DataContainer responseCont = exec.createDataContainer(tableSpec);

    long cumulativeCounter = 0;
    // the cumulative response curve starts in the origin
    responseCont.addRowToTable(new DefaultRow(new RowKey("0"), 0.0, 0.0));
    for (int i = 0; i < counter.length; i++) {
        cumulativeCounter += counter[i];
        double responseRate = (double) counter[i] / partWidthAbsolute;
        double lift = responseRate / avgResponse;
        double cumResponseRate = (double) cumulativeCounter / totalResponses;
        // Rows covered up to and including this interval. Widen to long
        // before multiplying so the product cannot overflow int, and cap at
        // the row count because the last interval may be only partly filled.
        long number = Math.min((long) partWidthAbsolute * (i + 1), rowIndex);
        double cumulativeLift = (double) cumulativeCounter / number;
        cumulativeLift /= avgResponse;
        // upper bound of this interval in percent, capped at 100; used both
        // as row key and as cumulative baseline value
        double rowKey = Math.min((i + 1) * partWidth, 100);
        cont.addRowToTable(new DefaultRow(new RowKey("" + rowKey), lift, 1.0, cumulativeLift));
        double cumBaseline = rowKey;
        responseCont.addRowToTable(new DefaultRow(new RowKey("" + rowKey), cumResponseRate * 100, cumBaseline));
    }
    cont.close();
    responseCont.close();
    m_dataArray[0] = new DefaultDataArray(cont.getTable(), 1, (int) cont.size());
    m_dataArray[1] = new DefaultDataArray(responseCont.getTable(), 1, (int) responseCont.size());
    return new BufferedDataTable[] { st.getBufferedDataTable() };
}
use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class CAIMDiscretizationNodeModel method createAllIntervalBoundaries2.
/**
 * Sorts the data table in ascending order on the given column, determines
 * the distinct values and builds a doubly linked list that holds the
 * midpoints of all adjacent distinct values. These midpoints represent all
 * possible interval boundaries. The list starts with a sentinel head element
 * carrying {@link Double#NEGATIVE_INFINITY}; the minimum and maximum values
 * themselves are not added.
 *
 * @param table the table with the data
 * @param columnIndex the column of interest
 * @param exec the execution context to set the progress
 * @return the boundary scheme wrapping the head of the linked list and the
 *         number of midpoint boundaries (zero for an empty table or a
 *         column with a single distinct value)
 * @throws Exception if sorting the table fails
 */
private BoundaryScheme createAllIntervalBoundaries2(final BufferedDataTable table, final int columnIndex, final ExecutionContext exec) throws Exception {
    // sort the data according to the column index
    List<String> sortColumn = new ArrayList<String>();
    sortColumn.add(table.getDataTableSpec().getColumnSpec(columnIndex).getName());
    // in ascending order
    boolean[] sortOrder = new boolean[] { true };
    // NOTE(review): the fourth argument presumably sorts missing values to
    // the end - confirm against SortedTable
    SortedTable sortedTable = new SortedTable(table, sortColumn, sortOrder, true, exec);

    // create the head of the linked double list,
    // marked by negative infinity
    LinkedDouble head = new LinkedDouble(Double.NEGATIVE_INFINITY);

    RowIterator rowIterator = sortedTable.iterator();
    if (!rowIterator.hasNext()) {
        // empty table: there are no values and hence no boundaries
        return new BoundaryScheme(head, 0);
    }

    // the first different value is the minimum value of the sorted list
    // NOTE(review): a missing cell in this column would fail the DoubleValue
    // casts below - presumably filtered by the caller; confirm
    double lastDifferentValue = ((DoubleValue) rowIterator.next().getCell(columnIndex)).getDoubleValue();
    // set the last added element
    LinkedDouble lastAdded = head;
    // count the number of boundaries
    int numBoundaries = 0;
    while (rowIterator.hasNext()) {
        DataRow row = rowIterator.next();
        DataCell cell = row.getCell(columnIndex);
        double value = ((DoubleValue) cell).getDoubleValue();
        if (value != lastDifferentValue) {
            // a new boundary is the midpoint
            double newBoundary = (value + lastDifferentValue) / 2.0D;
            lastDifferentValue = value;
            // append the new midpoint boundary to the linked list
            LinkedDouble added = new LinkedDouble(newBoundary);
            added.m_previous = lastAdded;
            lastAdded.m_next = added;
            lastAdded = added;
            numBoundaries++;
        }
    }
    return new BoundaryScheme(head, numBoundaries);
}
use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class LiftCalculator method calculateLiftTables.
/**
 * Calculates the tables necessary for displaying a lift chart.
 *
 * <p>
 * The table is sorted by the probability column in descending order. Rows
 * with a missing response cell are always skipped; rows with a missing
 * probability cell are skipped if missing values are to be ignored,
 * otherwise they are kept and a warning is returned. The remaining rows are
 * split into intervals of the configured width (in percent). The results
 * are stored in the sorted, lift and response table fields.
 *
 * @param table the data table
 * @param exec the execution context to report progress to
 * @return warning messages or null
 * @throws CanceledExecutionException when the user cancels the execution
 * @throws IllegalArgumentException if the configured interval width is not a
 *             positive number
 */
public String calculateLiftTables(final BufferedDataTable table, final ExecutionContext exec) throws CanceledExecutionException {
    // Validate the interval width before the (potentially expensive) sort.
    // A zero, negative or NaN width would otherwise end in a huge or
    // negative array size further down.
    double partWidth = m_intervalWidth;
    if (!(partWidth > 0)) {
        throw new IllegalArgumentException("Interval width must be a positive number but was " + partWidth);
    }
    int nrParts = (int) Math.ceil(100.0 / partWidth);

    int predColIndex = table.getDataTableSpec().findColumnIndex(m_responseColumn);
    int probColInd = table.getDataTableSpec().findColumnIndex(m_probabilityColumn);
    String warning = null;
    List<String> inclList = new LinkedList<String>();
    inclList.add(m_probabilityColumn);
    // false = descending: the highest probabilities come first
    boolean[] order = new boolean[] { false };
    m_sorted = new SortedTable(table, inclList, order, exec);

    long totalResponses = 0;
    // indices (within the counted rows) of all rows carrying the response label
    List<Integer> positiveResponses = new LinkedList<Integer>();
    // counts only the rows that are not skipped
    int rowIndex = 0;
    for (DataRow row : m_sorted) {
        if (row.getCell(predColIndex).isMissing()) {
            // miss. values in class column we always ignore
            continue;
        }
        if (row.getCell(probColInd).isMissing()) {
            if (m_ignoreMissingValues) {
                continue;
            }
            warning = "Table contains missing values.";
        }
        String response = ((StringValue) row.getCell(predColIndex)).getStringValue().trim();
        if (response.equalsIgnoreCase(m_responseLabel)) {
            totalResponses++;
            positiveResponses.add(rowIndex);
        }
        rowIndex++;
    }

    // number of positive responses per interval
    int[] counter = new int[nrParts];
    // number of rows per interval (the last interval may be shorter)
    int partWidthAbsolute = (int) Math.ceil(rowIndex / (double) nrParts);
    double avgResponse = (double) positiveResponses.size() / rowIndex;
    for (int rIndex : positiveResponses) {
        counter[rIndex / partWidthAbsolute]++;
    }

    DataColumnSpec[] colSpec = new DataColumnSpec[3];
    colSpec[0] = new DataColumnSpecCreator("Lift", DoubleCell.TYPE).createSpec();
    colSpec[1] = new DataColumnSpecCreator("Baseline", DoubleCell.TYPE).createSpec();
    colSpec[2] = new DataColumnSpecCreator("Cumulative Lift", DoubleCell.TYPE).createSpec();
    DataTableSpec tableSpec = new DataTableSpec(colSpec);
    DataContainer cont = exec.createDataContainer(tableSpec);

    colSpec = new DataColumnSpec[2];
    colSpec[0] = new DataColumnSpecCreator("Actual", DoubleCell.TYPE).createSpec();
    colSpec[1] = new DataColumnSpecCreator("Baseline", DoubleCell.TYPE).createSpec();
    tableSpec = new DataTableSpec(colSpec);
    DataContainer responseCont = exec.createDataContainer(tableSpec);

    long cumulativeCounter = 0;
    // the cumulative response curve starts in the origin
    responseCont.addRowToTable(new DefaultRow(new RowKey("0"), 0.0, 0.0));
    for (int i = 0; i < counter.length; i++) {
        cumulativeCounter += counter[i];
        double responseRate = (double) counter[i] / partWidthAbsolute;
        double lift = responseRate / avgResponse;
        double cumResponseRate = (double) cumulativeCounter / totalResponses;
        // Rows covered up to and including this interval. Widen to long
        // before multiplying so the product cannot overflow int, and cap at
        // the row count because the last interval may be only partly filled.
        long number = Math.min((long) partWidthAbsolute * (i + 1), rowIndex);
        double cumulativeLift = (double) cumulativeCounter / number;
        cumulativeLift /= avgResponse;
        // upper bound of this interval in percent, capped at 100; used both
        // as row key and as cumulative baseline value
        double rowKey = Math.min((i + 1) * partWidth, 100);
        cont.addRowToTable(new DefaultRow(new RowKey("" + rowKey), lift, 1.0, cumulativeLift));
        double cumBaseline = rowKey;
        responseCont.addRowToTable(new DefaultRow(new RowKey("" + rowKey), cumResponseRate * 100, cumBaseline));
    }
    cont.close();
    responseCont.close();
    m_lift = (BufferedDataTable) cont.getTable();
    m_response = (BufferedDataTable) responseCont.getTable();
    return warning;
}
use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class BoxplotCalculator method calculateMultiple.
/**
 * Calculates the necessary statistics for a non-conditional boxplot.
 *
 * <p>
 * For every requested column the non-missing cells are copied into a
 * single-column table, which is then sorted ascending. Minimum, maximum,
 * quartiles and median are read from the sorted rows by position; whiskers
 * and mild/extreme outliers are derived from the 1.5 and 3 times
 * inter-quartile-range fences. Columns without any non-missing value get no
 * statistics and are recorded as excluded; the number of missing values of
 * the remaining columns is recorded as well.
 *
 * @param table the input data
 * @param numCol array of names of numeric columns to plot
 * @param exec Execution context to report progress to
 * @return LinkedHashMap with the column name as key and statistics as value
 * @throws CanceledExecutionException when the user cancels the execution
 */
public LinkedHashMap<String, BoxplotStatistics> calculateMultiple(final BufferedDataTable table, final String[] numCol, final ExecutionContext exec) throws CanceledExecutionException {
    DataTableSpec spec = table.getSpec();
    int[] numColIdxs = new int[numCol.length];
    for (int i = 0; i < numCol.length; i++) {
        numColIdxs[i] = spec.findColumnIndex(numCol[i]);
    }
    // one single-column container per requested column
    LinkedHashMap<String, DataContainer> containers = new LinkedHashMap<String, DataContainer>();
    for (int i = 0; i < numCol.length; i++) {
        containers.put(numCol[i], exec.createDataContainer(new DataTableSpec(new String[] { "col" }, new DataType[] { DoubleCell.TYPE })));
    }
    ExecutionContext subExec = exec.createSilentSubExecutionContext(0.7);
    long[] numMissValPerCol = new long[numCol.length];
    int count = 0;
    // distribute the non-missing cells to the per-column containers and
    // count the missing ones
    for (DataRow row : table) {
        exec.checkCanceled();
        subExec.setProgress((double) count++ / table.size());
        for (int i = 0; i < numCol.length; i++) {
            DataCell cell = row.getCell(numColIdxs[i]);
            if (!cell.isMissing()) {
                containers.get(numCol[i]).addRowToTable(new DefaultRow(row.getKey(), cell));
            } else {
                numMissValPerCol[i]++;
            }
        }
    }
    LinkedHashMap<String, BoxplotStatistics> statsMap = new LinkedHashMap<>();
    ExecutionContext subExec2 = exec.createSilentSubExecutionContext(1.0);
    count = 0;
    List<String> excludedDataColList = new ArrayList<String>();
    for (Entry<String, DataContainer> entry : containers.entrySet()) {
        exec.checkCanceled();
        subExec2.setProgress((double) count++ / containers.size());
        Set<Outlier> extremeOutliers = new HashSet<Outlier>();
        Set<Outlier> mildOutliers = new HashSet<Outlier>();
        entry.getValue().close();
        BufferedDataTable catTable = (BufferedDataTable) entry.getValue().getTable();
        if (catTable.size() == 0) {
            // only missing values in this column: nothing to compute
            excludedDataColList.add(entry.getKey());
            continue;
        }
        SortedTable st = new SortedTable(catTable, new Comparator<DataRow>() {
            @Override
            public int compare(final DataRow o1, final DataRow o2) {
                double d1 = ((DoubleValue) o1.getCell(0)).getDoubleValue();
                double d2 = ((DoubleValue) o2.getCell(0)).getDoubleValue();
                // Double.compare yields a consistent total order even for
                // NaN; a plain ==/< comparison would report NaN as "greater"
                // in both directions and break the Comparator contract.
                return Double.compare(d1, d2);
            }
        }, false, exec);
        double min = 0, max = 0, q1 = 0, q3 = 0, median = 0;
        // When the row count divides evenly, the quartile/median is the mean
        // of the two rows around the cut (d* flag set); otherwise it is the
        // single row at the index.
        boolean dq1 = catTable.size() % 4 == 0;
        long q1Idx = catTable.size() / 4;
        boolean dq3 = 3 * catTable.size() % 4 == 0;
        long q3Idx = 3 * catTable.size() / 4;
        boolean dMedian = catTable.size() % 2 == 0;
        long medianIdx = catTable.size() / 2;
        int counter = 0;
        // first pass over the sorted values: positional statistics
        for (DataRow row : st) {
            double val = ((DoubleValue) row.getCell(0)).getDoubleValue();
            if (counter == 0) {
                min = val;
            }
            if (counter == catTable.size() - 1) {
                max = val;
            }
            if (counter == q1Idx - 1 && dq1) {
                // remember the lower neighbour, averaged in the next step
                q1 = val;
            }
            if (counter == q1Idx || (counter == 0 && st.size() <= 3)) {
                if (dq1) {
                    q1 = (q1 + val) / 2.0;
                } else {
                    q1 = val;
                }
            }
            if (counter == medianIdx - 1 && dMedian) {
                median = val;
            }
            if (counter == medianIdx) {
                if (dMedian) {
                    median = (median + val) / 2;
                } else {
                    median = val;
                }
            }
            if (counter == q3Idx - 1 && dq3) {
                q3 = val;
            }
            if (counter == q3Idx || (counter == st.size() - 1 && st.size() <= 3)) {
                if (dq3) {
                    q3 = (q3 + val) / 2.0;
                } else {
                    q3 = val;
                }
            }
            counter++;
        }
        double iqr = q3 - q1;
        double lowerWhisker = min;
        double upperWhisker = max;
        // values beyond the 1.5 * IQR fences are mild outliers,
        // values beyond the 3 * IQR fences are extreme outliers
        double upperWhiskerFence = q3 + (1.5 * iqr);
        double lowerWhiskerFence = q1 - (1.5 * iqr);
        double lowerFence = q1 - (3 * iqr);
        double upperFence = q3 + (3 * iqr);
        // second pass over the sorted values: classify outliers and move the
        // whiskers to the outermost values inside the 1.5 * IQR fences
        for (DataRow row : st) {
            double value = ((DoubleValue) row.getCell(0)).getDoubleValue();
            String rowKey = row.getKey().getString();
            if (value < lowerFence) {
                extremeOutliers.add(new Outlier(value, rowKey));
            } else if (value < lowerWhiskerFence) {
                mildOutliers.add(new Outlier(value, rowKey));
            } else if (lowerWhisker < lowerWhiskerFence && value >= lowerWhiskerFence) {
                // first value inside the lower fence (rows are ascending)
                lowerWhisker = value;
            } else if (value <= upperWhiskerFence) {
                // keeps being overwritten, ends at the last value inside the upper fence
                upperWhisker = value;
            } else if (value > upperFence) {
                extremeOutliers.add(new Outlier(value, rowKey));
            } else if (value > upperWhiskerFence) {
                mildOutliers.add(new Outlier(value, rowKey));
            }
        }
        statsMap.put(entry.getKey(), new BoxplotStatistics(mildOutliers, extremeOutliers, min, max, lowerWhisker, q1, median, q3, upperWhisker));
    }
    // missing values part
    m_excludedDataCols = excludedDataColList.toArray(new String[excludedDataColList.size()]);
    m_numMissValPerCol = new LinkedHashMap<String, Long>();
    for (int i = 0; i < numCol.length; i++) {
        // report missing value counts only for columns that were not excluded entirely
        if (numMissValPerCol[i] > 0 && !excludedDataColList.contains(numCol[i])) {
            m_numMissValPerCol.put(numCol[i], numMissValPerCol[i]);
        }
    }
    return statsMap;
}
use of org.knime.base.data.sort.SortedTable in project knime-core by knime.
the class ColumnToGrid2NodeModel method execute.
/**
 * {@inheritDoc}
 *
 * <p>
 * Arranges the included columns of consecutive input rows side by side in a
 * grid with the configured number of grid columns. If a group column is
 * configured, the input is first reduced to the relevant columns and sorted
 * by that column; each output row then only holds rows of a single group and
 * carries the group value in its last cell. Unused grid cells stay missing.
 */
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
    final String groupColumn = m_configuration.getGroupColumn();
    final boolean hasGroupColumn = groupColumn != null;

    final ExecutionMonitor mainExec;
    final BufferedDataTable inputTable;
    if (!hasGroupColumn) {
        inputTable = inData[0];
        mainExec = exec;
    } else {
        exec.setMessage("Sorting input table");
        final BufferedDataTable unsorted = inData[0];
        final ExecutionContext sortExec = exec.createSubExecutionContext(0.5);
        final ColumnRearranger keepRelevant = new ColumnRearranger(unsorted.getDataTableSpec());
        // included columns followed by the group column
        final String[] relevantCols = new String[m_included.length + 1];
        System.arraycopy(m_included, 0, relevantCols, 0, m_included.length);
        relevantCols[m_included.length] = groupColumn;
        keepRelevant.keepOnly(relevantCols);
        final BufferedDataTable filtered =
            exec.createColumnRearrangeTable(unsorted, keepRelevant, exec.createSubProgress(0.0));
        final SortedTable sorted =
            new SortedTable(filtered, Collections.singletonList(groupColumn), new boolean[] { true }, sortExec);
        inputTable = sorted.getBufferedDataTable();
        mainExec = exec.createSubProgress(0.5);
    }

    exec.setMessage("Assembling output");
    final DataTableSpec spec = inputTable.getDataTableSpec();
    final DataTableSpec outSpec = createOutputSpec(spec);
    final BufferedDataContainer cont = exec.createDataContainer(outSpec);

    final int[] includeIndices = new int[m_included.length];
    for (int i = 0; i < includeIndices.length; i++) {
        includeIndices[i] = spec.findColumnIndex(m_included[i]);
    }

    final int gridCount = m_configuration.getColCount();
    // one extra trailing cell for the group value, if grouping is enabled
    final int cellCount = includeIndices.length * gridCount + (hasGroupColumn ? 1 : 0);
    final int groupColIndex = hasGroupColumn ? spec.findColumnIndex(groupColumn) : -1;

    final DataCell[] cells = new DataCell[cellCount];
    final PushBackRowIterator it = new PushBackRowIterator(inputTable.iterator());
    final long totalRows = inputTable.size();
    long currentRow = 0;
    long currentOutRow = 0;
    DataCell curGroupValue = null;
    while (it.hasNext()) {
        Arrays.fill(cells, DataType.getMissingCell());
        // assign group column (if enabled): peek at the next row without consuming it
        if (groupColIndex >= 0) {
            final DataRow peeked = it.next();
            curGroupValue = peeked.getCell(groupColIndex);
            cells[cells.length - 1] = curGroupValue;
            it.pushBack(peeked);
        }
        for (int grid = 0; grid < gridCount && it.hasNext(); grid++) {
            final DataRow inRow = it.next();
            final DataCell groupValue = groupColIndex < 0 ? null : inRow.getCell(groupColIndex);
            if (!ConvenienceMethods.areEqual(curGroupValue, groupValue)) {
                // start new group, i.e. new row
                it.pushBack(inRow);
                break;
            }
            mainExec.setProgress(currentRow / (double) totalRows, "Processing row " + currentRow + "/" + totalRows + ": " + inRow.getKey());
            currentRow += 1;
            mainExec.checkCanceled();
            final int offset = grid * includeIndices.length;
            for (int i = 0; i < includeIndices.length; i++) {
                cells[offset + i] = inRow.getCell(includeIndices[i]);
            }
        }
        final RowKey key = RowKey.createRowKey(currentOutRow++);
        cont.addRowToTable(new DefaultRow(key, cells));
    }
    cont.close();
    return new BufferedDataTable[] { cont.getTable() };
}
Aggregations