use of org.knime.core.data.DataValueComparator in project knime-core by knime.
the class StatisticsTable method calculateAllMoments.
/**
 * Computes <b>every statistical moment in a single pass</b> over the table.
 * Once this method has run, the per-column minimum, maximum, sum, mean,
 * variance, missing-value count and the row count are cached in the member
 * fields, and a table spec carrying the observed bounds is stored as well.
 *
 * @param rowCount Row count of table for progress, may be NaN if unknown.
 * @param exec object to check with if user canceled the operation; progress
 *            reporting and cancel checks are skipped when it is {@code null}
 * @throws CanceledExecutionException if user canceled
 * @throws IllegalArgumentException if rowCount argument < 0
 */
protected void calculateAllMoments(final double rowCount, final ExecutionMonitor exec) throws CanceledExecutionException {
    if (rowCount < 0.0) {
        throw new IllegalArgumentException("rowCount argument must not < 0: " + rowCount);
    }
    final DataTableSpec inSpec = m_table.getDataTableSpec();
    final int colCount = inSpec.getNumColumns();
    // per column: number of non-missing cells that were read as doubles, and the
    // sum of their squares (Java zero-initialises both arrays)
    final int[] numericCellCount = new int[colCount];
    final double[] squaredSum = new double[colCount];
    final DataValueComparator[] comparators = new DataValueComparator[colCount];
    // whether the column's type is compatible with DoubleValue; depends on the
    // spec only, so it is resolved once per column instead of once per cell
    final boolean[] isNumeric = new boolean[colCount];
    for (int col = 0; col < colCount; col++) {
        final DataType colType = inSpec.getColumnSpec(col).getType();
        comparators[col] = colType.getComparator();
        assert comparators[col] != null;
        isNumeric[col] = colType.isCompatible(DoubleValue.class);
    }

    int rowsSeen = 0;
    final RowIterator rows = m_table.iterator();
    while (rows.hasNext()) {
        final DataRow current = rows.next();
        if (exec != null) {
            final double fraction = Double.isNaN(rowCount) ? 0.0 : rowsSeen / rowCount;
            exec.setProgress(fraction, "Calculating statistics, processing row " + (rowsSeen + 1) + " (\"" + current.getKey() + "\")");
            // throws exception if user canceled
            exec.checkCanceled();
        }
        for (int col = 0; col < colCount; col++) {
            final DataCell value = current.getCell(col);
            if (value.isMissing()) {
                m_missingValueCnt[col]++;
                continue;
            }
            // track the smallest and largest cell seen so far in this column
            if (m_minValues[col] == null || comparators[col].compare(value, m_minValues[col]) < 0) {
                m_minValues[col] = value;
            }
            if (m_maxValues[col] == null || comparators[col].compare(m_maxValues[col], value) < 0) {
                m_maxValues[col] = value;
            }
            if (!isNumeric[col]) {
                continue;
            }
            // numeric columns additionally feed the sum / sum of squares
            final double d = ((DoubleValue) value).getDoubleValue();
            // a NaN sum is replaced by the first value rather than added to
            m_sum[col] = Double.isNaN(m_sum[col]) ? d : m_sum[col] + d;
            squaredSum[col] += d * d;
            numericCellCount[col]++;
        }
        calculateMomentInSubClass(current);
        rowsSeen++;
    }
    m_nrRows = rowsSeen;

    for (int col = 0; col < colCount; col++) {
        final int n = numericCellCount[col];
        if (n == 0 || m_minValues[col] == null) {
            // nothing numeric was seen: bounds become missing cells, moments NaN
            final DataCell missing = DataType.getMissingCell();
            m_minValues[col] = missing;
            m_maxValues[col] = missing;
            m_meanValues[col] = Double.NaN;
            m_varianceValues[col] = Double.NaN;
            continue;
        }
        m_meanValues[col] = m_sum[col] / n;
        double variance = 0.0;
        if (n > 1) {
            variance = (squaredSum[col] - ((m_sum[col] * m_sum[col]) / n)) / (n - 1);
        }
        // round-off errors may produce a slightly negative variance; clamp it.
        // NOTE(review): the lower limit is -1.0E8 as written; a round-off
        // tolerance would more typically be -1.0E-8 - confirm the intent.
        if (variance < 0.0 && variance > -1.0E8) {
            variance = 0.0;
        }
        m_varianceValues[col] = variance;
        assert m_varianceValues[col] >= 0.0 : "Variance cannot be negative (column \"" + inSpec.getColumnSpec(col).getName() + "\": " + m_varianceValues[col];
    }

    // build the resulting table spec: same columns, domains carrying our bounds
    final DataTableSpec tableSpec = m_table.getDataTableSpec();
    final int specCols = tableSpec.getNumColumns();
    final DataColumnSpec[] newColSpecs = new DataColumnSpec[specCols];
    for (int col = 0; col < specCols; col++) {
        final DataColumnSpec oldColSpec = tableSpec.getColumnSpec(col);
        final DataColumnDomain oldDomain = oldColSpec.getDomain();
        final Set<DataCell> possibleValues = oldDomain == null ? null : oldDomain.getValues();
        final DataCell min = m_minValues[col];
        final DataCell max = m_maxValues[col];
        // unset or missing bounds are passed on as null
        final DataCell lower = (min == null || min.isMissing()) ? null : min;
        final DataCell upper = (max == null || max.isMissing()) ? null : max;
        final DataColumnDomain boundedDomain = new DataColumnDomainCreator(possibleValues, lower, upper).createDomain();
        final DataColumnSpecCreator specCreator = new DataColumnSpecCreator(oldColSpec);
        specCreator.setDomain(boundedDomain);
        newColSpecs[col] = specCreator.createSpec();
    }
    m_tSpec = new DataTableSpec(newColSpecs);
}
use of org.knime.core.data.DataValueComparator in project knime-core by knime.
the class AccuracyScorerNodeModel method sort.
/**
 * Sorts the given cells in place according to the configured sorting
 * strategy and, if requested, reverses the result afterwards. An empty array
 * is left untouched.
 *
 * @param order The cells to sort.
 * @throws IllegalStateException if the strategy is unknown or cannot be
 *             applied to the common super type of the given cells
 */
private void sort(final DataCell[] order) {
    if (order.length == 0) {
        return;
    }
    // fold all cell types into their common super type
    DataType commonType = order[0].getType();
    for (int i = 0; i < order.length; i++) {
        commonType = DataType.getCommonSuperType(commonType, order[i].getType());
    }

    final Comparator<DataCell> cellOrder;
    switch (m_sortingStrategy) {
        case Unsorted:
            return;
        case InsertionOrder:
            // keep the given order, only honour the reverse flag
            if (m_sortingReversed) {
                reverse(order);
            }
            return;
        case Numeric:
            if (!DoubleCell.TYPE.isASuperTypeOf(commonType)) {
                throw new IllegalStateException("Numerical sorting strategy is not supported.");
            }
            cellOrder = commonType.getComparator();
            break;
        case Lexical:
            if (StringCell.TYPE.isASuperTypeOf(commonType)) {
                final Collator collator = Collator.getInstance();
                // do not try to combine characters
                collator.setDecomposition(Collator.NO_DECOMPOSITION);
                // case and accents matter.
                collator.setStrength(Collator.IDENTICAL);
                // Collator is a Comparator<Object>; narrow it to strings
                @SuppressWarnings("unchecked")
                final Comparator<String> byCollation = (Comparator<String>) (Comparator<?>) collator;
                cellOrder = new StringValueComparator(byCollation);
            } else if (DoubleCell.TYPE.isASuperTypeOf(commonType)) {
                // order such values by their string representation
                cellOrder = new DataValueComparator() {
                    @Override
                    protected int compareDataValues(final DataValue v1, final DataValue v2) {
                        return v1.toString().compareTo(v2.toString());
                    }
                };
            } else {
                throw new IllegalStateException("Lexical sorting strategy is not supported.");
            }
            break;
        default:
            throw new IllegalStateException("Unrecognized sorting strategy: " + m_sortingStrategy);
    }

    Arrays.sort(order, cellOrder);
    if (m_sortingReversed) {
        reverse(order);
    }
}
use of org.knime.core.data.DataValueComparator in project knime-core by knime.
the class ColumnRowFilterPanel method boundsChanged.
/**
 * Called when user changes the values for the lower or upper bounds.
 * Clears the current error message and then validates the entered range:
 * nothing is checked while no table spec is set, no column is selected or
 * range checking is not selected. Otherwise an error message is set if a
 * bound cannot be created, if neither bound is specified, or if the upper
 * bound compares smaller than the lower bound. A warning is set when either
 * bound is a {@code StringCell}.
 */
protected void boundsChanged() {
    // check if the entered value somehow goes along with the selected col.
    setErrMsg("");
    if (m_tSpec == null) {
        return;
    }
    if (getSelectedColumnName() == null) {
        return;
    }
    if (!m_useRange.isSelected()) {
        return;
    }
    DataCell lowBound = null;
    DataCell hiBound = null;
    try {
        lowBound = getLowerBoundCell();
        hiBound = getUpperBoundCell();
    } catch (InvalidSettingsException ise) {
        setErrMsg(ise.getMessage());
        return;
    }
    if ((lowBound == null) && (hiBound == null)) {
        setErrMsg("Specify at least one range boundary");
        return;
    }
    if ((lowBound != null) && (hiBound != null)) {
        final DataValueComparator comp =
            DataType.getCommonSuperType(lowBound.getType(), hiBound.getType()).getComparator();
        // A comparator only guarantees the SIGN of its result, not the exact
        // value -1 (string-style comparators return arbitrary negative
        // numbers), so test for "negative" instead of "== -1".
        if (comp.compare(hiBound, lowBound) < 0) {
            setErrMsg("The lower bound must be smaller than the" + " upper bound");
            return;
        }
    }
    // instanceof is false for null, so no separate null checks are needed
    if ((lowBound instanceof StringCell) || (hiBound instanceof StringCell)) {
        setErrMsg("Warning: String comparison is used for " + "range checking. May not work as expected!");
    }
}
use of org.knime.core.data.DataValueComparator in project knime-core by knime.
the class BigGroupByTable method createGroupByTable.
/**
* {@inheritDoc}
*
* <p>This implementation processes the input chunk wise: if at least one
* group column is given, the table is first passed to {@code sortTable}
* (reported on a sub execution context covering 0.6 of the progress) and the
* grouping itself is reported on a second sub context covering 0.4. Without
* group columns the table is used as is and {@code exec} itself is used for
* the grouping progress.
*
* <p>Rows are then read in order. Consecutive rows belong to the same chunk
* as long as {@code sameChunk(comparators, previousGroup, currentGroup)}
* holds; when it no longer does, the collected chunk is handed to
* {@code createTableRows} and the chunk map is cleared. Inside a chunk the
* rows are additionally separated by {@code GroupKey}, see the comment at
* {@code chunkMembers}.
*
* @param exec used to create the sub execution contexts, to set messages and
*            to create the result container
* @param table the input table
* @param resultSpec spec the result container is created with
* @param groupColIdx indices of the group columns within the spec of
*            {@code table}
* @return the table of the result container after it has been closed
* @throws CanceledExecutionException declared by this method;
*             {@code groupExec.checkCanceled()} is called once per row
*/
@Override
protected BufferedDataTable createGroupByTable(final ExecutionContext exec, final BufferedDataTable table, final DataTableSpec resultSpec, final int[] groupColIdx) throws CanceledExecutionException {
LOGGER.debug("Entering createGroupByTable(exec, table) " + "of class BigGroupByTable.");
final DataTableSpec origSpec = table.getDataTableSpec();
// sort the data table in order to process the input table chunk wise
final BufferedDataTable sortedTable;
final ExecutionContext groupExec;
// one comparator per group column, in the order of groupColIdx
final DataValueComparator[] comparators;
if (groupColIdx.length < 1) {
// no group columns: nothing to sort by, no comparators needed
sortedTable = table;
groupExec = exec;
comparators = new DataValueComparator[0];
} else {
final ExecutionContext sortExec = exec.createSubExecutionContext(0.6);
exec.setMessage("Sorting input table...");
sortedTable = sortTable(sortExec, table, getGroupCols());
sortExec.setProgress(1.0);
groupExec = exec.createSubExecutionContext(0.4);
comparators = new DataValueComparator[groupColIdx.length];
for (int i = 0, length = groupColIdx.length; i < length; i++) {
final DataColumnSpec colSpec = origSpec.getColumnSpec(groupColIdx[i]);
comparators[i] = colSpec.getType().getComparator();
}
}
final BufferedDataContainer dc = exec.createDataContainer(resultSpec);
exec.setMessage("Creating groups");
// group column cells of the row that opened the current chunk / of the current row
final DataCell[] previousGroup = new DataCell[groupColIdx.length];
final DataCell[] currentGroup = new DataCell[groupColIdx.length];
final MutableInteger groupCounter = new MutableInteger(0);
boolean firstRow = true;
// kept as double so that the progress division below is a floating point division
final double numOfRows = sortedTable.size();
long rowCounter = 0;
// In the rare case that the DataCell comparator return 0 for two
// data cells that are not equal we have to maintain a map with all
// rows with equal cells in the group columns per chunk.
// This variable stores for each chunk these members. A chunk consists
// of rows which return 0 for the pairwise group value comparison.
// Usually only equal data cells return 0 when compared with each other
// but in rare occasions also data cells that are NOT equal return 0 when
// compared to each other
// (such as cells that contain chemical structures).
// In this rare case this map will contain for each group of data cells
// that are pairwise equal in the chunk a separate entry.
final Map<GroupKey, Pair<ColumnAggregator[], Set<RowKey>>> chunkMembers = new LinkedHashMap<>(3);
// the "unusual cells" message below is logged at most once per call
boolean logUnusualCells = true;
String groupLabel = "";
// cannot put init to the constructor, as the super() constructor directly calls the current function
initMissingValuesMap();
for (final DataRow row : sortedTable) {
// fetch the current group column values
for (int i = 0, length = groupColIdx.length; i < length; i++) {
currentGroup[i] = row.getCell(groupColIdx[i]);
}
if (firstRow) {
// the very first row opens the first chunk: seed previousGroup with its
// own values so that the chunk check below compares the row with itself
groupLabel = createGroupLabelForProgress(currentGroup);
System.arraycopy(currentGroup, 0, previousGroup, 0, currentGroup.length);
firstRow = false;
}
// a new chunk starts as soon as sameChunk no longer holds for the group
// column data cells of the previous and the current row
if (!sameChunk(comparators, previousGroup, currentGroup)) {
groupLabel = createGroupLabelForProgress(currentGroup);
// flush the finished chunk into the result container
createTableRows(dc, chunkMembers, groupCounter);
// set the current group as previous group
System.arraycopy(currentGroup, 0, previousGroup, 0, currentGroup.length);
if (logUnusualCells && chunkMembers.size() > 1) {
// more than one member in a chunk means the comparators returned 0 for
// cells that are not equal: log the classes of the cells involved
if (LOGGER.isEnabledFor(LEVEL.INFO)) {
final StringBuilder buf = new StringBuilder();
buf.append("Data chunk with ");
buf.append(chunkMembers.size());
buf.append(" members occured in groupby node. " + "Involved classes are: ");
final GroupKey key = chunkMembers.keySet().iterator().next();
for (final DataCell cell : key.getGroupVals()) {
buf.append(cell.getClass().getCanonicalName());
buf.append(", ");
}
LOGGER.info(buf.toString());
}
logUnusualCells = false;
}
// reset the chunk members map
chunkMembers.clear();
}
// process the row as one of the members of the current chunk
Pair<ColumnAggregator[], Set<RowKey>> member = chunkMembers.get(new GroupKey(currentGroup));
if (member == null) {
// first row with these group values in the current chunk: register a new member
Set<RowKey> rowKeys;
if (isEnableHilite()) {
rowKeys = new HashSet<>();
} else {
// never written to: the add below is guarded by the same isEnableHilite() check
rowKeys = Collections.emptySet();
}
member = new Pair<>(cloneColumnAggregators(), rowKeys);
// currentGroup is overwritten for every row, so the map key gets its own copy
final DataCell[] groupKeys = new DataCell[currentGroup.length];
System.arraycopy(currentGroup, 0, groupKeys, 0, currentGroup.length);
chunkMembers.put(new GroupKey(groupKeys), member);
}
// compute the current row values
for (final ColumnAggregator colAggr : member.getFirst()) {
final int colIdx = origSpec.findColumnIndex(colAggr.getOriginalColName());
colAggr.getOperator(getGlobalSettings()).compute(row, colIdx);
}
if (isEnableHilite()) {
member.getSecond().add(row.getKey());
}
groupExec.checkCanceled();
groupExec.setProgress(++rowCounter / numOfRows, groupLabel);
}
// create the final row for the last chunk after processing the last
// table row
createTableRows(dc, chunkMembers, groupCounter);
dc.close();
return dc.getTable();
}
use of org.knime.core.data.DataValueComparator in project knime-core by knime.
the class RowComparator method compareCells.
/**
 * Compares the cells that two rows hold in the i-th sort column.
 * If missing cells are to be sorted to the end and at least one of the two
 * cells is missing, the decision is delegated to {@code sortMissingsToEnd};
 * otherwise the comparator registered for the i-th sort column is used.
 *
 * @param dr1 the first row
 * @param dr2 the second row
 * @param i position within the sort columns (index into {@code m_indices}
 *            and {@code m_colComparators})
 * @return the result of the comparison
 */
private int compareCells(final DataRow dr1, final DataRow dr2, final int i) {
    final int columnIndex = m_indices[i];
    final DataCell left = dr1.getCell(columnIndex);
    final DataCell right = dr2.getCell(columnIndex);
    final boolean leftMissing = left.isMissing();
    final boolean rightMissing = right.isMissing();
    final boolean anyMissing = leftMissing || rightMissing;
    if (m_sortMissingsToEnd && anyMissing) {
        return sortMissingsToEnd(i, leftMissing, rightMissing);
    }
    return m_colComparators[i].compare(left, right);
}
Aggregations