use of org.knime.core.data.DataColumnDomain in project knime-core by knime.
the class RPropNodeModel method configure.
/**
 * Checks that the class column is set and contained in the input spec, validates the
 * learning columns, and creates the output spec.
 *
 * {@inheritDoc}
 */
@Override
protected PortObjectSpec[] configure(final PortObjectSpec[] inSpecs) throws InvalidSettingsException {
if (m_classcol.getStringValue() != null) {
List<String> learningCols = new LinkedList<String>();
List<String> targetCols = new LinkedList<String>();
boolean classcolinspec = false;
for (DataColumnSpec colspec : (DataTableSpec) inSpecs[INDATA]) {
if (!colspec.getName().equals(m_classcol.getStringValue())) {
if (!colspec.getType().isCompatible(DoubleValue.class)) {
throw new InvalidSettingsException("Only double columns for input");
} else {
learningCols.add(colspec.getName());
DataColumnDomain domain = colspec.getDomain();
if (domain.hasBounds()) {
double lower = ((DoubleValue) domain.getLowerBound()).getDoubleValue();
double upper = ((DoubleValue) domain.getUpperBound()).getDoubleValue();
if (lower < 0 || upper > 1) {
setWarningMessage("Input data not normalized." + " Please consider using the " + "Normalizer Node first.");
}
}
}
} else {
targetCols.add(colspec.getName());
classcolinspec = true;
// TODO: Check what happens to other values than double
if (colspec.getType().isCompatible(DoubleValue.class)) {
// check if the values are in range [0,1]
DataColumnDomain domain = colspec.getDomain();
if (domain.hasBounds()) {
double lower = ((DoubleValue) domain.getLowerBound()).getDoubleValue();
double upper = ((DoubleValue) domain.getUpperBound()).getDoubleValue();
if (lower < 0 || upper > 1) {
throw new InvalidSettingsException("Domain range for regression in column " + colspec.getName() + " not in range [0,1]");
}
}
}
}
}
if (!classcolinspec) {
throw new InvalidSettingsException("Class column " + m_classcol.getStringValue() + " not found in DataTableSpec");
}
return new PortObjectSpec[] { createPMMLPortObjectSpec(m_pmmlInEnabled ? (PMMLPortObjectSpec) inSpecs[1] : null, (DataTableSpec) inSpecs[0], learningCols, targetCols) };
} else {
throw new InvalidSettingsException("Class column not set");
}
}
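The configure() method above only warns when a learning column's domain bounds fall outside [0, 1]. Below is a minimal, self-contained sketch of just that bounds check; the class and method names are illustrative and not part of knime-core, and it assumes a numeric column whose domain bound cells implement DoubleValue.

import org.knime.core.data.DataColumnDomain;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DoubleValue;

final class DomainBoundsCheck {

    /**
     * Returns true if the column's domain carries bounds and both lie inside [0, 1].
     * Columns without domain bounds are treated as "unknown" and reported as false.
     */
    static boolean isNormalized(final DataColumnSpec colSpec) {
        DataColumnDomain domain = colSpec.getDomain();
        if (!domain.hasBounds()) {
            return false; // no bound information stored in the spec
        }
        // assumes a numeric column, i.e. the bound cells implement DoubleValue
        double lower = ((DoubleValue) domain.getLowerBound()).getDoubleValue();
        double upper = ((DoubleValue) domain.getUpperBound()).getDoubleValue();
        return lower >= 0 && upper <= 1;
    }
}

Note that domain bounds are spec metadata: if they were never computed (for example by a Domain Calculator node upstream), hasBounds() can be false even for numeric columns.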
use of org.knime.core.data.DataColumnDomain in project knime-core by knime.
the class EditNominalDomainNodeModel method sortPossibleValues.
private DataTableSpec sortPossibleValues(final DataTableSpec orgSpec) throws InvalidSettingsException {
if (m_configuration == null) {
throw new InvalidSettingsException("Missing Configuration.");
}
Set<String> configuredColumns = new HashSet<String>(m_configuration.getConfiguredColumns());
String[] columnNames = orgSpec.getColumnNames();
DataTableSpecCreator creator = new DataTableSpecCreator(orgSpec).dropAllColumns();
for (int i = 0; i < orgSpec.getNumColumns(); i++) {
String name = columnNames[i];
if (configuredColumns.remove(name)) {
DataColumnSpec orgDataSpec = orgSpec.getColumnSpec(i);
if (!StringCell.TYPE.equals(orgDataSpec.getType())) {
CheckUtils.checkSetting(m_configuration.isIgnoreWrongTypes(), "Column '%s' must be of type '%s' \nbut was of type: '%s'", name, StringCell.TYPE, orgDataSpec.getType());
creator.addColumns(orgDataSpec);
} else {
DataColumnDomain domain = orgDataSpec.getDomain();
DataColumnSpecCreator dataColumnSpecCreator = new DataColumnSpecCreator(orgDataSpec);
DataColumnDomainCreator yetAnotherCreator = new DataColumnDomainCreator(domain.getLowerBound(), domain.getUpperBound());
List<DataCell> sorting = new ArrayList<DataCell>(m_configuration.getSorting(name));
Set<DataCell> difference = diff(domain.getValues(), sorting);
yetAnotherCreator.setValues(resolveNewValues(sorting, difference));
dataColumnSpecCreator.setDomain(yetAnotherCreator.createDomain());
creator.addColumns(dataColumnSpecCreator.createSpec());
}
} else {
creator.addColumns(orgSpec.getColumnSpec(i));
}
}
if (!configuredColumns.isEmpty()) {
String missingColumnsString = "Following columns are configured but no longer exist: \n" + ConvenienceMethods.getShortStringFrom(configuredColumns, 5);
CheckUtils.checkSetting(m_configuration.isIgnoreNotExistingColumns(), missingColumnsString);
setWarningMessage(missingColumnsString);
}
return creator.createSpec();
}
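The DataColumnDomain detail worth noting in sortPossibleValues() is that the ordering of the nominal values is carried by the Set handed to DataColumnDomainCreator.setValues(). The following sketch condenses that rebuild step, assuming the caller already supplies the values in the desired order; class and method names are illustrative, not knime-core API.

import java.util.LinkedHashSet;
import java.util.Set;

import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnDomain;
import org.knime.core.data.DataColumnDomainCreator;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;

final class NominalDomainRewrite {

    /**
     * Returns a copy of the given column spec whose domain carries the values in
     * the supplied iteration order. The bounds of the original domain are kept.
     */
    static DataColumnSpec withOrderedValues(final DataColumnSpec orgSpec,
            final Set<DataCell> orderedValues) {
        DataColumnDomain oldDomain = orgSpec.getDomain();
        // keep the old bounds, replace only the set of possible values;
        // a LinkedHashSet preserves the desired ordering
        DataColumnDomainCreator domainCreator =
            new DataColumnDomainCreator(oldDomain.getLowerBound(), oldDomain.getUpperBound());
        domainCreator.setValues(new LinkedHashSet<DataCell>(orderedValues));
        DataColumnSpecCreator specCreator = new DataColumnSpecCreator(orgSpec);
        specCreator.setDomain(domainCreator.createDomain());
        return specCreator.createSpec();
    }
}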
use of org.knime.core.data.DataColumnDomain in project knime-core by knime.
the class CAIMDiscretizationNodeModel method execute.
/**
* {@inheritDoc}
*/
@Override
protected PortObject[] execute(final PortObject[] inData, final ExecutionContext exec) throws Exception {
// measure the time
long startTime = System.currentTimeMillis();
// empty model
if (m_includedColumnNames.getIncludeList() == null || m_includedColumnNames.getIncludeList().size() == 0) {
return new PortObject[] { inData[0], new DiscretizationModel() };
}
LOGGER.debug("Start discretizing.");
// the algorithm handles binary class problems (positive, negative) only,
// so it is run once per class value, treating that value as the positive
// class and all remaining values as the negative class
exec.setProgress(0.0, "Preparing...");
// check input data
BufferedDataTable data = (BufferedDataTable) inData[0];
// get class column index
m_classifyColumnIndex = data.getDataTableSpec().findColumnIndex(m_classColumnName.getStringValue());
assert m_classifyColumnIndex > -1;
// create the class - index mapping
createClassFromToIndexMaps(data.getDataTableSpec());
// create the array with the result discretization schemes for
// each included column
DiscretizationScheme[] resultSchemes = new DiscretizationScheme[m_includedColumnNames.getIncludeList().size()];
// for all included columns do the discretization
int currentColumn = 0;
for (String includedColumnName : m_includedColumnNames.getIncludeList()) {
LOGGER.debug("Process column: " + includedColumnName);
exec.setProgress("Discretizing column '" + includedColumnName + "'");
ExecutionContext subExecPerColumn = exec.createSubExecutionContext(1.0D / m_includedColumnNames.getIncludeList().size());
subExecPerColumn.checkCanceled();
// never discretize the class column itself (should never happen)
if (m_classColumnName.getStringValue().equals(includedColumnName)) {
continue;
}
// determine the column index of the current column
int columnIndex = data.getDataTableSpec().findColumnIndex(includedColumnName);
DataColumnDomain domain = data.getDataTableSpec().getColumnSpec(columnIndex).getDomain();
double minValue = ((DoubleValue) domain.getLowerBound()).getDoubleValue();
double maxValue = ((DoubleValue) domain.getUpperBound()).getDoubleValue();
// find all distinct values of the column and create
// a table with all possible interval boundaries (midpoint value of
// adjacent values)
subExecPerColumn.setProgress("Find possible boundaries.");
BoundaryScheme boundaryScheme = null;
// create subExec for sorting
ExecutionContext subExecSort = subExecPerColumn.createSubExecutionContext(0.1);
// long t1 = System.currentTimeMillis();
if (m_classOptimizedVersion) {
boundaryScheme = createAllIntervalBoundaries(data, columnIndex, subExecSort);
} else {
boundaryScheme = createAllIntervalBoundaries2(data, columnIndex, subExecSort);
}
subExecSort.setProgress(1.0D);
// long t2 = System.currentTimeMillis() - t1;
// LOGGER.error("Create boundaries time: " + (t2 / 1000.0)
// + " optimized: " + m_classOptimizedVersion);
// LOGGER.error("Boundaries: " + boundaryScheme.getHead());
LinkedDouble allIntervalBoundaries = boundaryScheme.getHead();
// create the initial discretization scheme
DiscretizationScheme discretizationScheme = new DiscretizationScheme(new Interval(minValue, maxValue, true, true));
double globalCAIM = 0;
// perform the iterative search for the best intervals
int numInsertedBounds = 0;
double currentCAIM = 0;
// create subExec for inserted bounds
ExecutionContext subExecBounds = subExecPerColumn.createSubExecutionContext(0.9);
while (currentCAIM > globalCAIM || numInsertedBounds < m_classValues.length - 1) {
subExecPerColumn.checkCanceled();
// create subExec for counting
ExecutionContext subExecCount = subExecBounds.createSubExecutionContext(1.0D / m_classValues.length);
// LOGGER.debug("Inserted bounds: " + numInsertedBounds);
// LOGGER.debug("intervall boundaries: " +
// allIntervalBoundaries);
// for all possible interval boundaries
// insert each one, calculate the caim value and add
// the one with the biggest caim
LinkedDouble intervalBoundary = allIntervalBoundaries.m_next;
currentCAIM = 0;
LinkedDouble bestBoundary = null;
long currentCountedBoundaries = 0;
while (intervalBoundary != null) {
subExecPerColumn.checkCanceled();
// set progress
currentCountedBoundaries++;
subExecCount.setProgress((double) currentCountedBoundaries / (double) boundaryScheme.getNumBoundaries(), "Count for possible boundary " + currentCountedBoundaries + " of " + boundaryScheme.getNumBoundaries());
// LOGGER.debug("current caim: " + currentCAIM);
DiscretizationScheme tentativeDS = new DiscretizationScheme(discretizationScheme);
tentativeDS.insertBound(intervalBoundary.m_value);
// create the quanta matrix
QuantaMatrix2D quantaMatrix = new QuantaMatrix2D(tentativeDS, m_classValueToIndexMap);
// pass the data for filling the matrix
quantaMatrix.countData(data, columnIndex, m_classifyColumnIndex);
// calculate the caim
double caim = quantaMatrix.calculateCaim();
if (caim > currentCAIM) {
currentCAIM = caim;
bestBoundary = intervalBoundary;
}
intervalBoundary = intervalBoundary.m_next;
}
// if there is no best boundary, leave the outer search loop
if (bestBoundary == null) {
break;
}
// in this case accept the best discretization scheme
if (currentCAIM > globalCAIM || numInsertedBounds < m_classValues.length) {
int numIntervals = discretizationScheme.getNumIntervals();
discretizationScheme.insertBound(bestBoundary.m_value);
// remove the linked list element from the list
bestBoundary.remove();
globalCAIM = currentCAIM;
if (numIntervals < discretizationScheme.getNumIntervals()) {
numInsertedBounds++;
subExecPerColumn.setProgress("Inserted bound " + numInsertedBounds);
// LOGGER.debug("Inserted boundary: "
// + bestBoundary.m_value);
} else {
throw new IllegalStateException("Only usefull bounds should be inserted: " + bestBoundary.m_value);
}
}
subExecCount.setProgress(1.0D);
}
resultSchemes[currentColumn] = discretizationScheme;
subExecBounds.setProgress(1.0D);
// ensure the full progress is set for this iteration
subExecPerColumn.setProgress(1.0D);
currentColumn++;
}
// set the model
DataTableSpec modelSpec = createModelSpec(m_includedColumnNames, data.getDataTableSpec());
m_discretizationModel = new DiscretizationModel(resultSchemes, modelSpec);
// create an output table that replaces the included columns by
// interval values
BufferedDataTable resultTable = createResultTable(exec, data, m_discretizationModel);
// log the runtime of the execute method
long runtime = System.currentTimeMillis() - startTime;
LOGGER.debug("Binning runtime: " + (runtime / 1000.0) + " sec.");
return new PortObject[] { resultTable, m_discretizationModel };
}
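The execute() method seeds each DiscretizationScheme with the column's [min, max] range read straight from the domain. Below is a small sketch of just that lookup; the helper is illustrative (not knime-core API) and assumes the domain bounds have been populated, e.g. by a Domain Calculator node upstream.

import org.knime.core.data.DataColumnDomain;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.DoubleValue;

final class DomainRange {

    /**
     * Reads the [min, max] range of a numeric column from the table spec's domain.
     * Throws if the column is missing or the domain carries no bounds.
     */
    static double[] rangeOf(final DataTableSpec spec, final String columnName) {
        int columnIndex = spec.findColumnIndex(columnName);
        if (columnIndex < 0) {
            throw new IllegalArgumentException("Column not found: " + columnName);
        }
        DataColumnDomain domain = spec.getColumnSpec(columnIndex).getDomain();
        if (!domain.hasBounds()) {
            throw new IllegalStateException("No domain bounds for column: " + columnName);
        }
        // assumes a numeric column, i.e. the bound cells implement DoubleValue
        double min = ((DoubleValue) domain.getLowerBound()).getDoubleValue();
        double max = ((DoubleValue) domain.getUpperBound()).getDoubleValue();
        return new double[] { min, max };
    }
}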
use of org.knime.core.data.DataColumnDomain in project knime-core by knime.
the class NominalTable method computeValues.
/**
* Finds all possible values based on a table and a number of given column
* indices by iterating through the table.
*
* @param table the table to get values from
* @param columnIndex an array of sorted column indices
* @param exec an object to check if user canceled
* @return a modified table spec containing all possible values
* @throws NullPointerException if the table is <code>null</code>
* @throws IllegalArgumentException if column indices are not sorted
* @throws IndexOutOfBoundsException if a column index is out of range
* @throws CanceledExecutionException if user canceled operation
*/
public static final DataTableSpec computeValues(final BufferedDataTable table, final ExecutionMonitor exec, final int... columnIndex) throws CanceledExecutionException {
DataTableSpec oldSpec = table.getDataTableSpec();
// keep all possible values for each column (index)
@SuppressWarnings("unchecked") Set<DataCell>[] set = new Set[columnIndex.length];
HashSet<Integer> hash = new HashSet<Integer>();
for (int c = 0; c < columnIndex.length; c++) {
if (columnIndex[c] == -1) {
throw new IllegalArgumentException("Column " + columnIndex[c] + " not found.");
}
if (hash.contains(columnIndex[c])) {
throw new IllegalArgumentException("Column indices " + " contain duplicates: " + c);
}
if (c > 0 && columnIndex[c - 1] >= columnIndex[c]) {
throw new IllegalArgumentException("Column indices are " + "not sorted.");
}
hash.add(columnIndex[c]);
set[c] = new HashSet<DataCell>();
}
// overall rows in the table
long rowCount = 0;
for (DataRow row : table) {
// get value for column indices
for (int c = 0; c < columnIndex.length; c++) {
DataCell cell = row.getCell(columnIndex[c]);
// adds only each value once
set[c].add(cell);
}
if (exec != null) {
// throws exception if user canceled
exec.checkCanceled();
exec.setProgress((double) ++rowCount / table.size(), "" + row.getKey());
}
}
DataColumnSpec[] newColSpecs = new DataColumnSpec[oldSpec.getNumColumns()];
// index within the set of possible values
int idx = 0;
for (int i = 0; i < newColSpecs.length; i++) {
DataColumnSpec oldColSpec = oldSpec.getColumnSpec(i);
if (hash.contains(i)) {
DataColumnSpecCreator creator = new DataColumnSpecCreator(oldColSpec);
DataCell lower = null;
DataCell upper = null;
if (oldColSpec.getDomain().hasBounds()) {
lower = oldColSpec.getDomain().getLowerBound();
upper = oldColSpec.getDomain().getUpperBound();
} else {
// TODO DoubleValue is too restrictive
if (oldColSpec.getType().isCompatible(DoubleValue.class)) {
TreeSet<DataCell> tSet = new TreeSet<DataCell>(oldColSpec.getType().getComparator());
tSet.addAll(set[idx]);
lower = tSet.first();
upper = tSet.last();
}
}
DataColumnDomain dom = new DataColumnDomainCreator(set[idx], lower, upper).createDomain();
creator.setDomain(dom);
newColSpecs[i] = creator.createSpec();
idx++;
} else {
newColSpecs[i] = oldColSpec;
}
}
// create new table spec along with all column specs
return new DataTableSpec(newColSpecs);
}
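computeValues() is essentially a single scan that collects the distinct cells of each selected column and writes them back as the domain's possible values. The following single-column sketch condenses the same pattern; class and method names are illustrative, the original bounds are kept as they are, and cancellation/progress handling is omitted.

import java.util.LinkedHashSet;
import java.util.Set;

import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnDomain;
import org.knime.core.data.DataColumnDomainCreator;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTable;

final class PossibleValuesScan {

    /**
     * Scans one column and returns a copy of its spec whose domain lists every
     * distinct cell value found in the table; the original bounds are retained.
     */
    static DataColumnSpec withPossibleValues(final DataTable table, final int columnIndex) {
        Set<DataCell> values = new LinkedHashSet<DataCell>();
        for (DataRow row : table) {
            values.add(row.getCell(columnIndex)); // a Set keeps each value only once
        }
        DataColumnSpec oldSpec = table.getDataTableSpec().getColumnSpec(columnIndex);
        DataColumnDomain oldDomain = oldSpec.getDomain();
        DataColumnDomainCreator domainCreator = new DataColumnDomainCreator(
            values, oldDomain.getLowerBound(), oldDomain.getUpperBound());
        DataColumnSpecCreator specCreator = new DataColumnSpecCreator(oldSpec);
        specCreator.setDomain(domainCreator.createDomain());
        return specCreator.createSpec();
    }
}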
use of org.knime.core.data.DataColumnDomain in project knime-core by knime.
the class MissingValueHandlingTable method createTableSpecPrivate.
/* private helper that assumes the ColSetting to have the right format. */
private static DataTableSpec createTableSpecPrivate(final DataTableSpec spec, final ColSetting[] sets) {
assert (spec.getNumColumns() == sets.length);
DataColumnSpec[] newSpecs = new DataColumnSpec[sets.length];
for (int i = 0; i < sets.length; i++) {
DataColumnSpec colSpec = spec.getColumnSpec(i);
DataColumnSpec newSpec = colSpec;
if (sets[i].getMethod() == ColSetting.METHOD_FIX_VAL) {
DataColumnDomain dom = colSpec.getDomain();
Comparator<DataCell> comp = colSpec.getType().getComparator();
DataCell fixCell = sets[i].getFixCell();
boolean changed = false;
DataCell l = dom.getLowerBound();
// the domain bounds should not contain missing cells (but rather be null). It may happen anyway, we catch it here
if (l != null && !l.isMissing() && (comp.compare(fixCell, l) < 0)) {
changed = true;
l = fixCell;
}
DataCell u = dom.getUpperBound();
if (u != null && !u.isMissing() && (comp.compare(fixCell, u) > 0)) {
changed = true;
u = fixCell;
}
Set<DataCell> vals = dom.getValues();
if (vals != null && !vals.contains(fixCell)) {
changed = true;
vals = new LinkedHashSet<DataCell>(vals);
vals.add(fixCell);
}
if (changed) {
DataColumnDomain newDom = new DataColumnDomainCreator(vals, l, u).createDomain();
DataColumnSpecCreator c = new DataColumnSpecCreator(colSpec);
c.setDomain(newDom);
newSpec = c.createSpec();
}
}
newSpecs[i] = newSpec;
}
return new DataTableSpec(newSpecs);
}
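As a compact usage example of the domain-widening idea from createTableSpecPrivate(), the standalone snippet below builds a double column with bounds [0, 5] and widens the upper bound for a replacement value of 10. It is a sketch only: the class name is illustrative and it assumes the org.knime.core plug-in is on the classpath.

import org.knime.core.data.DataColumnDomain;
import org.knime.core.data.DataColumnDomainCreator;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.def.DoubleCell;

public final class DomainWidenDemo {

    public static void main(final String[] args) {
        // column "x" of type double with domain bounds [0, 5]
        DataColumnSpecCreator specCreator = new DataColumnSpecCreator("x", DoubleCell.TYPE);
        specCreator.setDomain(new DataColumnDomainCreator(
            new DoubleCell(0.0), new DoubleCell(5.0)).createDomain());
        DataColumnSpec spec = specCreator.createSpec();

        // a fixed replacement value that lies outside the current bounds
        DoubleCell fixCell = new DoubleCell(10.0);
        DataColumnDomain dom = spec.getDomain();
        if (spec.getType().getComparator().compare(fixCell, dom.getUpperBound()) > 0) {
            DataColumnSpecCreator widened = new DataColumnSpecCreator(spec);
            widened.setDomain(new DataColumnDomainCreator(
                dom.getLowerBound(), fixCell).createDomain());
            spec = widened.createSpec();
        }
        System.out.println(spec.getDomain().getUpperBound()); // prints 10.0
    }
}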