use of org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme in project knime-core by knime.
the class BinModelPlotter method updatePaintModel.
/**
* {@inheritDoc}
* <p>
* Rebuilds the {@link BinRuler}s shown on the drawing pane from the current
* discretization model and the current column selection. Does nothing when
* no model is set. When no column is selected the (already cleared) pane is
* only repainted.
*/
@Override
public synchronized void updatePaintModel() {
if (m_discretizationModel == null) {
return;
}
BinModelDrawingPane drawingPane = (BinModelDrawingPane) getDrawingPane();
// clear the drawing pane
drawingPane.setBinningSchemes(null);
// on first use, select every column the model provides a binning for
if (m_selectedColumns == null) {
m_selectedColumns = new LinkedHashSet<String>();
for (String binnedColumnName : m_discretizationModel.getIncludedColumnNames()) {
m_selectedColumns.add(binnedColumnName);
}
((MultiColumnPlotterProperties) getProperties()).updateColumnSelection(m_binnedColumnsSpec, m_selectedColumns);
}
if (m_selectedColumns.isEmpty()) {
drawingPane.repaint();
return;
}
// one coordinate per selected column that is present in the binned spec
m_coordinates = new ArrayList<Coordinate>();
for (String name : m_selectedColumns) {
int idx = m_binnedColumnsSpec.findColumnIndex(name);
if (idx >= 0) {
m_coordinates.add(Coordinate.createCoordinate(m_binnedColumnsSpec.getColumnSpec(idx)));
}
}
// get the binning schemes (and their column names) for the selected columns
DiscretizationScheme[] selectedSchemes = getSelectedSchemes();
String[] selectedColumnNames = getSelectedColumnNames();
BinRuler[] binRulers = new BinRuler[selectedSchemes.length];
// determine the width available for a bin ruler
// NOTE(review): m_hMargin and m_vMargin are read here and below, but are only
// assigned (to 10) at the end of this method - confirm their initial values.
int rulerWidth = getDrawingPaneDimension().width - 2 * m_hMargin;
for (int i = 0; i < selectedSchemes.length; i++) {
double[] bounds = selectedSchemes[i].getBounds();
double min = bounds[0];
double max = bounds[bounds.length - 1];
// build a helper column spec whose domain spans the scheme, so that a
// coordinate can map each bound to a pixel position on the ruler
DataColumnSpecCreator columnSpecCreator = new DataColumnSpecCreator("", DoubleCell.TYPE);
columnSpecCreator.setDomain(new DataColumnDomainCreator(new DoubleCell(min), new DoubleCell(max)).createDomain());
DoubleCoordinate coordinate = (DoubleCoordinate) Coordinate.createCoordinate(columnSpecCreator.createSpec());
// rulers are stacked vertically, one m_columnDisplayHeight apart
Point leftStart = new Point(m_hMargin, m_vMargin + (i + 1) * m_columnDisplayHeight);
int[] binPositions = new int[bounds.length];
String[] binLabels = new String[bounds.length];
for (int b = 0; b < bounds.length; b++) {
binPositions[b] = (int) coordinate.calculateMappedValue(new DoubleCell(bounds[b]), rulerWidth, true);
binLabels[b] = coordinate.formatNumber(bounds[b]);
}
binRulers[i] = new BinRuler(leftStart, rulerWidth, binPositions, binLabels, selectedColumnNames[i]);
}
drawingPane.setBinningSchemes(binRulers);
m_hMargin = 10;
m_vMargin = 10;
drawingPane.setHorizontalMargin(m_hMargin);
// make the pane tall enough for the last ruler plus some room below it
setHeight(binRulers[binRulers.length - 1].getLeftStartPoint().y + 40);
}
use of org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme in project knime-core by knime.
the class CAIMDiscretizationNodeModel method execute.
/**
* {@inheritDoc}
* <p>
* Computes one {@link DiscretizationScheme} per included column and returns
* the discretized table together with the resulting
* {@link DiscretizationModel}. If the include list is {@code null} or empty,
* the input port object is returned unchanged together with an empty model.
* <p>
* For every included column the scheme starts as a single closed interval
* spanning the column domain. Bounds are then added greedily: each pass
* tentatively inserts every remaining candidate boundary, computes the CAIM
* value of the resulting scheme via {@link QuantaMatrix2D}, and keeps the
* candidate with the highest value.
*/
@Override
protected PortObject[] execute(final PortObject[] inData, final ExecutionContext exec) throws Exception {
// measure the time
long startTime = System.currentTimeMillis();
// empty model: nothing to discretize, pass the input through
if (m_includedColumnNames.getIncludeList() == null || m_includedColumnNames.getIncludeList().size() == 0) {
return new PortObject[] { inData[0], new DiscretizationModel() };
}
LOGGER.debug("Start discretizing.");
// as the algorithm is for binary class problems only
// (positive, negative) the algorithm is performed for each class value
// labeled as positive class and the rest as negative
exec.setProgress(0.0, "Preparing...");
// check input data
BufferedDataTable data = (BufferedDataTable) inData[0];
// get class column index
m_classifyColumnIndex = data.getDataTableSpec().findColumnIndex(m_classColumnName.getStringValue());
// NOTE(review): only checked when assertions are enabled; with -ea off a
// missing class column is not detected here.
assert m_classifyColumnIndex > -1;
// create the class - index mapping
createClassFromToIndexMaps(data.getDataTableSpec());
// create the array with the result discretization schemes for
// each included column
DiscretizationScheme[] resultSchemes = new DiscretizationScheme[m_includedColumnNames.getIncludeList().size()];
// for all included columns do the discretization
int currentColumn = 0;
for (String includedColumnName : m_includedColumnNames.getIncludeList()) {
LOGGER.debug("Process column: " + includedColumnName);
exec.setProgress("Discretizing column '" + includedColumnName + "'");
// every column gets an equal share of the overall progress
ExecutionContext subExecPerColumn = exec.createSubExecutionContext(1.0D / m_includedColumnNames.getIncludeList().size());
subExecPerColumn.checkCanceled();
// never discretize the class column (should never happen)
// NOTE(review): skipping here bypasses the assignment to resultSchemes and
// the currentColumn increment at the end of the loop body, so in that case
// the last slot of resultSchemes stays null.
if (m_classColumnName.getStringValue().equals(includedColumnName)) {
continue;
}
// determine the column index of the current column
int columnIndex = data.getDataTableSpec().findColumnIndex(includedColumnName);
DataColumnDomain domain = data.getDataTableSpec().getColumnSpec(columnIndex).getDomain();
// NOTE(review): assumes the domain has non-null bounds that implement
// DoubleValue - confirm this is guaranteed by the caller/configure.
double minValue = ((DoubleValue) domain.getLowerBound()).getDoubleValue();
double maxValue = ((DoubleValue) domain.getUpperBound()).getDoubleValue();
// find all distinct values of the column and create
// a table with all possible interval boundaries (midpoint value of
// adjacent values)
subExecPerColumn.setProgress("Find possible boundaries.");
BoundaryScheme boundaryScheme = null;
// create subExec for sorting (10% of the per-column progress)
ExecutionContext subExecSort = subExecPerColumn.createSubExecutionContext(0.1);
// long t1 = System.currentTimeMillis();
// m_classOptimizedVersion selects which of the two boundary finders is used
if (m_classOptimizedVersion) {
boundaryScheme = createAllIntervalBoundaries(data, columnIndex, subExecSort);
} else {
boundaryScheme = createAllIntervalBoundaries2(data, columnIndex, subExecSort);
}
subExecSort.setProgress(1.0D);
// long t2 = System.currentTimeMillis() - t1;
// LOGGER.error("Create boundaries time: " + (t2 / 1000.0)
// + " optimized: " + m_classOptimizedVersion);
// LOGGER.error("Boundaries: " + boundaryScheme.getHead());
// head of the linked list of candidate boundaries; candidates are read
// starting from head.m_next below
LinkedDouble allIntervalBoundaries = boundaryScheme.getHead();
// create the initial discretization scheme: one closed interval [min, max]
DiscretizationScheme discretizationScheme = new DiscretizationScheme(new Interval(minValue, maxValue, true, true));
// CAIM value of the currently accepted scheme
double globalCAIM = 0;
// perform the iterative search for the best intervals
int numInsertedBounds = 0;
// best CAIM value found in the most recent pass over the candidates
double currentCAIM = 0;
// create subExec for inserted bounds (90% of the per-column progress)
ExecutionContext subExecBounds = subExecPerColumn.createSubExecutionContext(0.9);
// keep going while the last pass beat the accepted CAIM value, or while
// fewer than m_classValues.length - 1 bounds have been inserted
while (currentCAIM > globalCAIM || numInsertedBounds < m_classValues.length - 1) {
subExecPerColumn.checkCanceled();
// create subExec for counting
ExecutionContext subExecCount = subExecBounds.createSubExecutionContext(1.0D / m_classValues.length);
// LOGGER.debug("Inserted bounds: " + numInsertedBounds);
// LOGGER.debug("intervall boundaries: " +
// allIntervalBoundaries);
// for all possible interval boundaries
// insert each one, calculate the caim value and add
// the one with the biggest caim
LinkedDouble intervalBoundary = allIntervalBoundaries.m_next;
currentCAIM = 0;
LinkedDouble bestBoundary = null;
long currentCountedBoundaries = 0;
while (intervalBoundary != null) {
subExecPerColumn.checkCanceled();
// set progress
currentCountedBoundaries++;
subExecCount.setProgress((double) currentCountedBoundaries / (double) boundaryScheme.getNumBoundaries(), "Count for possible boundary " + currentCountedBoundaries + " of " + boundaryScheme.getNumBoundaries());
// LOGGER.debug("current caim: " + currentCAIM);
// work on a copy so the accepted scheme is not modified by the trial
DiscretizationScheme tentativeDS = new DiscretizationScheme(discretizationScheme);
tentativeDS.insertBound(intervalBoundary.m_value);
// create the quanta matrix
QuantaMatrix2D quantaMatrix = new QuantaMatrix2D(tentativeDS, m_classValueToIndexMap);
// pass the data for filling the matrix
quantaMatrix.countData(data, columnIndex, m_classifyColumnIndex);
// calculate the caim
double caim = quantaMatrix.calculateCaim();
// remember the candidate with the highest caim of this pass
if (caim > currentCAIM) {
currentCAIM = caim;
bestBoundary = intervalBoundary;
}
intervalBoundary = intervalBoundary.m_next;
}
// if there is no best boundary, break the first while loop
if (bestBoundary == null) {
break;
}
// in this case accept the best discretization scheme
// NOTE(review): this test uses m_classValues.length while the loop
// condition above uses m_classValues.length - 1 - confirm this is intended.
if (currentCAIM > globalCAIM || numInsertedBounds < m_classValues.length) {
int numIntervals = discretizationScheme.getNumIntervals();
discretizationScheme.insertBound(bestBoundary.m_value);
// remove the linked list element from the list
bestBoundary.remove();
globalCAIM = currentCAIM;
// inserting a bound must have increased the number of intervals
if (numIntervals < discretizationScheme.getNumIntervals()) {
numInsertedBounds++;
subExecPerColumn.setProgress("Inserted bound " + numInsertedBounds);
// LOGGER.debug("Inserted boundary: "
// + bestBoundary.m_value);
} else {
throw new IllegalStateException("Only usefull bounds should be inserted: " + bestBoundary.m_value);
}
}
subExecCount.setProgress(1.0D);
}
resultSchemes[currentColumn] = discretizationScheme;
subExecBounds.setProgress(1.0D);
// ensure the full progress is set for this iteration
subExecPerColumn.setProgress(1.0D);
currentColumn++;
}
// set the model
DataTableSpec modelSpec = createModelSpec(m_includedColumnNames, data.getDataTableSpec());
m_discretizationModel = new DiscretizationModel(resultSchemes, modelSpec);
// create an output table that replaces the included columns by
// interval values
BufferedDataTable resultTable = createResultTable(exec, data, m_discretizationModel);
// log the runtime of the execute method
long runtime = System.currentTimeMillis() - startTime;
LOGGER.debug("Binning runtime: " + (runtime / 1000.0) + " sec.");
return new PortObject[] { resultTable, m_discretizationModel };
}
use of org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme in project knime-core by knime.
the class CAIMDiscretizationNodeModel method createResultTable.
/**
* Creates {@link BufferedDataTable} from a given input table and an
* appropriate {@link DiscretizationModel}. Every column of the table that is
* included in the model is replaced by a String column holding the discrete
* value of the original number; all other columns are copied as they are.
* Missing cells are passed through unchanged.
* <p>
* The scheme applied to a column is looked up by the column's name in the
* model, so the result does not depend on the table's columns appearing in
* the same order as the model's included columns.
*
* @param exec the context from which to create the
* {@link BufferedDataTable}
* @param table the input data table
* @param discretizationModel the {@link DiscretizationModel} that contains
* the mapping from numerical intervals to nominal String values
* for the included columns
* @return the discretized input data
* @throws IllegalStateException if a non-missing value has to be discretized
* for a column the model provides no scheme for
*/
public static BufferedDataTable createResultTable(final ExecutionContext exec, final BufferedDataTable table, final DiscretizationModel discretizationModel) {
// the schemes are stored parallel to the included column names
final DiscretizationScheme[] allSchemes = discretizationModel.getSchemes();
final String[] includedColumnNames = discretizationModel.getIncludedColumnNames();
final DataTableSpec originalTableSpec = table.getDataTableSpec();
final int numColumns = originalTableSpec.getNumColumns();
DataColumnSpec[] newColumnSpecs = new DataColumnSpec[numColumns];
// remembers if a column index is included or not
boolean[] included = new boolean[numColumns];
// the scheme to apply per table column (null for columns that are not included)
DiscretizationScheme[] schemeForColumn = new DiscretizationScheme[numColumns];
int counter = 0;
for (DataColumnSpec originalColumnSpec : originalTableSpec) {
// if the column is included for discretizing, change the spec
if (isIncluded(originalColumnSpec, includedColumnNames) > -1) {
// create a nominal string column spec
newColumnSpecs[counter] = new DataColumnSpecCreator(originalColumnSpec.getName(), StringCell.TYPE).createSpec();
included[counter] = true;
// resolve the scheme by name instead of relying on the column order
for (int s = 0; s < includedColumnNames.length && s < allSchemes.length; s++) {
if (includedColumnNames[s].equals(originalColumnSpec.getName())) {
schemeForColumn[counter] = allSchemes[s];
break;
}
}
} else {
// add it as is
newColumnSpecs[counter] = originalColumnSpec;
}
counter++;
}
// create the new table spec
DataTableSpec newTableSpec = new DataTableSpec(newColumnSpecs);
// create the result table
BufferedDataContainer container = exec.createDataContainer(newTableSpec);
// discretize the included column values
long rowCounter = 0;
final double numRows = table.size();
for (DataRow row : table) {
// report progress only every 200 rows
if (rowCounter % 200 == 0) {
exec.setProgress(rowCounter / numRows);
}
DataCell[] newCells = new DataCell[row.getNumCells()];
int i = 0;
for (DataCell cell : row) {
if (included[i] && !cell.isMissing()) {
DiscretizationScheme scheme = schemeForColumn[i];
if (scheme == null) {
throw new IllegalStateException("No discretization scheme available for column '" + newColumnSpecs[i].getName() + "'");
}
// transform the value to the discretized one
double value = ((DoubleValue) cell).getDoubleValue();
newCells[i] = new StringCell(scheme.getDiscreteValue(value));
} else {
// not included or missing: keep the cell
newCells[i] = cell;
}
i++;
}
container.addRowToTable(new DefaultRow(row.getKey(), newCells));
rowCounter++;
}
container.close();
return container.getTable();
}
use of org.knime.base.node.preproc.discretization.caim2.DiscretizationScheme in project knime-core by knime.
the class BinModelPlotter method getSelectedSchemes.
/**
* Creates an array of {@link DiscretizationScheme}s that contains the
* schemes for the selected columns, in the iteration order of the selection.
* <p>
* A scheme is matched to a selected column through the position of the
* column's name in the model's included column names. The array always has
* one slot per selected column; if a selected column is not part of the
* model, the trailing slots stay {@code null}.
*
* @return the selected discretization schemes
*/
private DiscretizationScheme[] getSelectedSchemes() {
String[] includedColumns = m_discretizationModel.getIncludedColumnNames();
// fetch the schemes once instead of once per match
DiscretizationScheme[] allSchemes = m_discretizationModel.getSchemes();
DiscretizationScheme[] result = new DiscretizationScheme[m_selectedColumns.size()];
int counter = 0;
for (String column : m_selectedColumns) {
for (int i = 0; i < includedColumns.length; i++) {
if (includedColumns[i].equals(column)) {
result[counter] = allSchemes[i];
counter++;
// one scheme per selected column: stop at the first match so that
// counter can never run past the end of result
break;
}
}
}
return result;
}
Aggregations