Use of org.knime.core.data.sort.DataTableSorter in project knime-core by knime.
Class TreeDataCreator, method readData.
/**
 * Reads the data from <b>learnData</b> into memory.
 * Each column is represented by a TreeColumnData object corresponding to its type
 * and whether it is an attribute or target column.
 *
 * <p>The table is first sorted by the target column (to enable equal size sampling) and then
 * read row by row. A row is skipped when its target cell is missing or when one of its attribute
 * cells is missing and the corresponding column creator does not accept missing values. If any
 * rows were skipped, a warning message is stored; if more accepted rows exist than hilite
 * patterns are kept, a view message is stored.</p>
 *
 * @param learnData the table to learn from; must contain at least 2 rows
 * @param configuration the learner configuration that is handed to the attribute column creators
 *            when the column data is built. NOTE(review): the target column name and the number
 *            of hilite patterns are read from the <code>m_configuration</code> field instead of
 *            this parameter - confirm that both always refer to the same configuration.
 * @param exec monitor used for progress reporting and cancelation; one half of the progress is
 *            assigned to sorting, the other half to reading
 * @return the TreeData object that holds all data in memory
 * @throws CanceledExecutionException if the execution is canceled while sorting or reading
 * @throws IllegalArgumentException if <b>learnData</b> has fewer than 2 rows; the check that at
 *             least one row survives the missing value filter is delegated to
 *             <code>CheckUtils.checkArgument</code>
 */
public TreeData readData(final BufferedDataTable learnData, final TreeEnsembleLearnerConfiguration configuration, final ExecutionMonitor exec) throws CanceledExecutionException {
    final long nrRows = learnData.size();
    if (nrRows <= 1) {
        throw new IllegalArgumentException("The input table must contain at least 2 rows!");
    }
    final int nrLearnCols = m_attrColCreators.length;
    final boolean[] supportMissings = new boolean[nrLearnCols];
    for (int i = 0; i < nrLearnCols; i++) {
        supportMissings[i] = m_attrColCreators[i].acceptsMissing();
    }
    final int nrHilitePatterns = m_configuration.getNrHilitePatterns();

    // sort learnData according to the target column to enable equal size sampling
    final int targetColIdx = learnData.getDataTableSpec().findColumnIndex(m_configuration.getTargetColumn());
    final Comparator<DataCell> targetComp =
        learnData.getDataTableSpec().getColumnSpec(targetColIdx).getType().getComparator();
    final DataTableSorter sorter = new DataTableSorter(learnData, nrRows,
        (left, right) -> targetComp.compare(left.getCell(targetColIdx), right.getCell(targetColIdx)));
    final ExecutionMonitor sortExec = exec.createSubProgress(0.5);
    final DataTable sortedTable = sorter.sort(sortExec);

    final ExecutionMonitor readExec = exec.createSubProgress(0.5);
    // number of rows that were accepted and handed to the column creators
    int index = 0;
    // number of rows looked at so far (accepted or rejected); drives the progress so that
    // rejected rows do not stall it
    long rowsVisited = 0;
    int rejectedMissings = 0;
    for (DataRow r : sortedTable) {
        final double progress = rowsVisited / (double) nrRows;
        readExec.setProgress(progress, "Row " + rowsVisited + " of " + nrRows + " (\"" + r.getKey() + "\")");
        readExec.checkCanceled();
        rowsVisited++;

        boolean shouldReject = false;
        for (int i = 0; i < nrLearnCols; i++) {
            if (r.getCell(i).isMissing() && !supportMissings[i]) {
                shouldReject = true;
                break;
            }
        }
        // assumes the target cell directly follows the attribute cells in each row - TODO confirm
        final DataCell targetCell = r.getCell(nrLearnCols);
        if (targetCell.isMissing()) {
            shouldReject = true;
        }
        if (shouldReject) {
            rejectedMissings += 1;
            continue;
        }

        // only the first nrHilitePatterns accepted rows are kept for hiliting
        if (index < nrHilitePatterns) {
            m_dataRowsForHiliteContainer.addRowToTable(r);
        }
        final RowKey key = r.getKey();
        for (int i = 0; i < nrLearnCols; i++) {
            m_attrColCreators[i].add(key, r.getCell(i));
        }
        m_targetColCreator.add(key, targetCell);
        index++;
    }

    if (nrHilitePatterns > 0 && index > nrHilitePatterns) {
        m_viewMessage = "Hilite (& color graphs) are based on a subset of " + "the data (" + nrHilitePatterns + "/" + index + ")";
    }
    if (rejectedMissings > 0) {
        m_warningMessage = rejectedMissings + "/" + nrRows
            + " row(s) were ignored because they contain missing values.";
    }
    CheckUtils.checkArgument(rejectedMissings < nrRows, "No rows left after removing missing values (table has %d row(s))", nrRows);

    // a single creator may contribute several attributes, so count them before allocating
    int nrLearnAttributes = 0;
    for (TreeAttributeColumnDataCreator creator : m_attrColCreators) {
        nrLearnAttributes += creator.getNrAttributes();
    }
    final TreeAttributeColumnData[] columns = new TreeAttributeColumnData[nrLearnAttributes];
    int learnAttributeIndex = 0;
    for (TreeAttributeColumnDataCreator creator : m_attrColCreators) {
        for (int a = 0; a < creator.getNrAttributes(); a++) {
            final TreeAttributeColumnData columnData = creator.createColumnData(a, configuration);
            columnData.getMetaData().setAttributeIndex(learnAttributeIndex);
            columns[learnAttributeIndex++] = columnData;
        }
    }
    final TreeTargetColumnData targetCol = m_targetColCreator.createColumnData();
    return new TreeData(columns, targetCol, m_treeType);
}
Use of org.knime.core.data.sort.DataTableSorter in project knime-core by knime.
Class TableSorterWorker, method doInBackground.
/**
 * {@inheritDoc}
 *
 * Sorts the input table on the columns described by the sort order, forwarding the sorter's
 * progress events through {@code publish}, and returns the sorted table.
 */
@Override
protected DataTable doInBackground() throws Exception {
    // row count is passed to the table sorter for progress; -1 means unknown, no progress
    final long totalRows = m_inputTable instanceof BufferedDataTable
        ? ((BufferedDataTable) m_inputTable).size()
        : m_inputTable instanceof ContainerTable
            ? ((ContainerTable) m_inputTable).size()
            : -1;
    publish(new NodeProgress(0.0, "Starting table sort..."));

    // translate the sort column indices into column names
    final Collection<String> columnNames = new ArrayList<String>(2);
    final DataTableSpec tableSpec = m_inputTable.getDataTableSpec();
    for (int colIndex : m_sortOrder.getSortColumnIndices()) {
        // a negative index stands for the row id
        columnNames.add(colIndex < 0
            ? DataTableSorter.ROWKEY_SORT_SPEC.getName()
            : tableSpec.getColumnSpec(colIndex).getName());
    }

    final long startMillis = System.currentTimeMillis();
    LOGGER.debug("Starting interactive table sorting on column(s) " + columnNames);
    final boolean[] ascendingFlags = m_sortOrder.getSortColumnOrder();
    // it DOES NOT respect blobs -- they will be copied (expensive)
    final DataTableSorter tableSorter =
        new DataTableSorter(m_inputTable, totalRows, columnNames, ascendingFlags, false);

    // relays every progress event of the monitor to this worker's publish mechanism
    final NodeProgressListener relay = new NodeProgressListener() {
        @Override
        public void progressChanged(final NodeProgressEvent event) {
            publish(event.getNodeProgress());
        }
    };
    m_nodeProgressMonitor = new DefaultNodeProgressMonitor();
    final ExecutionMonitor monitor = new ExecutionMonitor(m_nodeProgressMonitor);
    m_nodeProgressMonitor.addProgressListener(relay);
    try {
        final DataTable sorted = tableSorter.sort(monitor);
        final String elapsed = StringFormat.formatElapsedTime(System.currentTimeMillis() - startMillis);
        LOGGER.debug("Interactive table sorting finished (" + elapsed + ")");
        return sorted;
    } finally {
        // always detach the listener, also when sorting fails or is canceled
        m_nodeProgressMonitor.removeProgressListener(relay);
    }
}
Aggregations