use of org.knime.core.data.MissingCell in project knime-core by knime.
the class BoxplotCalculator method calculateMultipleConditional.
/**
 * Calculates statistics for conditional box plots of one or more numeric columns.
 *
 * <p>Every row is assigned to a category using its cell in {@code catCol}; rows with a missing category cell are
 * collected under {@code MISSING_VALUES_CLASS}. For each numeric column and each category the non-missing values
 * are gathered, sorted and summarised into a {@code BoxplotStatistics} (minimum, maximum, quartiles, median,
 * whiskers, mild and extreme outliers).</p>
 *
 * <p>Side effects: {@code m_ignoredMissVals} is replaced by the per-column, per-category counts of skipped missing
 * numeric cells (only categories that got statistics and have a non-zero count remain in it), and
 * {@code excludedClasses} is replaced by the per-column array of categories that received no values. The missing
 * values category is only listed there if missing numeric cells were actually counted for it.</p>
 *
 * @param table the data table
 * @param catCol the column with the category values
 * @param numCol the names of the numeric columns
 * @param exec an execution context
 * @return a linked hash map that maps each numeric column name to a linked hash map with the BoxplotStatistics
 *         of each non-empty category
 * @throws CanceledExecutionException when the user cancels the execution
 * @throws InvalidSettingsException when the category column has no domain values
 */
public LinkedHashMap<String, LinkedHashMap<String, BoxplotStatistics>> calculateMultipleConditional(
        final BufferedDataTable table, final String catCol, final String[] numCol, final ExecutionContext exec)
        throws CanceledExecutionException, InvalidSettingsException {
    DataTableSpec spec = table.getSpec();
    int catColIdx = spec.findColumnIndex(catCol);
    int[] numColIdxs = new int[numCol.length];
    for (int i = 0; i < numCol.length; i++) {
        numColIdxs[i] = spec.findColumnIndex(numCol[i]);
    }
    Set<DataCell> valuesSet = spec.getColumnSpec(catColIdx).getDomain().getValues();
    if (valuesSet == null) {
        throw new InvalidSettingsException("Selected category column has no domain values");
    }
    ArrayList<DataCell> vals = new ArrayList<>(valuesSet);
    Collections.sort(vals, new Comparator<DataCell>() {
        @Override
        public int compare(final DataCell o1, final DataCell o2) {
            return o1.toString().compareTo(o2.toString());
        }
    });
    // add Missing values class as it is never in specification
    vals.add(new MissingCell(null));

    // one single-column container and one missing-value counter per (numeric column, category) pair
    LinkedHashMap<String, LinkedHashMap<String, DataContainer>> containers = new LinkedHashMap<>();
    m_ignoredMissVals = new LinkedHashMap<>();
    for (int i = 0; i < numCol.length; i++) {
        LinkedHashMap<String, DataContainer> map = new LinkedHashMap<>();
        LinkedHashMap<String, Long> missValMap = new LinkedHashMap<>();
        for (DataCell c : vals) {
            // we need to have clear names, otherwise Missing values class will be taken as "?"
            String name = c.isMissing() ? MISSING_VALUES_CLASS : c.toString();
            map.put(name, exec.createDataContainer(
                new DataTableSpec(new String[] { "col" }, new DataType[] { DoubleCell.TYPE })));
            missValMap.put(name, 0L);
        }
        containers.put(numCol[i], map);
        m_ignoredMissVals.put(numCol[i], missValMap);
    }

    // first pass: distribute the numeric values over the containers
    ExecutionContext subExec = exec.createSubExecutionContext(0.7);
    long count = 0;
    final long numOfRows = table.size();
    for (DataRow row : table) {
        exec.checkCanceled();
        subExec.setProgress(count++ / (double) numOfRows);
        DataCell catCell = row.getCell(catColIdx);
        String catName = catCell.isMissing() ? MISSING_VALUES_CLASS : catCell.toString();
        for (int i = 0; i < numCol.length; i++) {
            DataCell cell = row.getCell(numColIdxs[i]);
            if (!cell.isMissing()) {
                containers.get(numCol[i]).get(catName).addRowToTable(new DefaultRow(row.getKey(), cell));
            } else {
                // increment missing values
                LinkedHashMap<String, Long> missValMap = m_ignoredMissVals.get(numCol[i]);
                missValMap.replace(catName, missValMap.get(catName) + 1);
            }
        }
    }

    // Orders rows by their single double value. Double.compare gives a total order, so the comparator
    // stays consistent even if a value is NaN.
    final Comparator<DataRow> byValue = new Comparator<DataRow>() {
        @Override
        public int compare(final DataRow o1, final DataRow o2) {
            double d1 = ((DoubleValue) o1.getCell(0)).getDoubleValue();
            double d2 = ((DoubleValue) o2.getCell(0)).getDoubleValue();
            return Double.compare(d1, d2);
        }
    };

    // second pass: compute the statistics of every non-empty container
    LinkedHashMap<String, LinkedHashMap<String, BoxplotStatistics>> statsMap = new LinkedHashMap<>();
    excludedClasses = new LinkedHashMap<>();
    // NOTE(review): 0.7 (above) + 1.0 exceeds the parent's progress range; presumably 0.3 was intended - confirm
    ExecutionContext subExec2 = exec.createSubExecutionContext(1.0);
    int count2 = 0;
    for (Entry<String, LinkedHashMap<String, DataContainer>> entry : containers.entrySet()) {
        exec.checkCanceled();
        subExec2.setProgress(count2++ / (double) containers.size());
        LinkedHashMap<String, DataContainer> containers2 = entry.getValue();
        LinkedHashMap<String, BoxplotStatistics> colStats = new LinkedHashMap<>();
        String colName = entry.getKey();
        List<String> excludedColClassesList = new ArrayList<>();
        for (Entry<String, DataContainer> entry2 : containers2.entrySet()) {
            Set<Outlier> extremeOutliers = new HashSet<>();
            Set<Outlier> mildOutliers = new HashSet<>();
            entry2.getValue().close();
            String catName = entry2.getKey();
            BufferedDataTable catTable = (BufferedDataTable) entry2.getValue().getTable();
            final long n = catTable.size();
            LinkedHashMap<String, Long> missValMap = m_ignoredMissVals.get(colName);
            if (n == 0) {
                if (!(catName.equals(MISSING_VALUES_CLASS) && missValMap.get(catName) == 0)) {
                    // we should add missing values to this list, only if they were there
                    excludedColClassesList.add(catName);
                }
                missValMap.remove(catName);
                continue;
            } else {
                if (missValMap.get(catName) == 0) {
                    missValMap.remove(catName);
                }
            }
            SortedTable st = new SortedTable(catTable, byValue, false, exec);
            double min = 0, max = 0, q1 = 0, q3 = 0, median = 0;
            // a "d" flag means the quantile is the mean of the two values around the index
            boolean dq1 = n % 4 == 0;
            long q1Idx = n / 4;
            boolean dq3 = 3 * n % 4 == 0;
            long q3Idx = 3 * n / 4;
            boolean dMedian = n % 2 == 0;
            long medianIdx = n / 2;
            // long, because it is compared against long table sizes
            long counter = 0;
            for (DataRow row : st) {
                double val = ((DoubleValue) row.getCell(0)).getDoubleValue();
                if (counter == 0) {
                    min = val;
                }
                if (counter == n - 1) {
                    max = val;
                }
                if (counter == q1Idx - 1 && dq1) {
                    q1 = val;
                }
                if (counter == q1Idx || (counter == 0 && st.size() <= 3)) {
                    if (dq1) {
                        q1 = (q1 + val) / 2.0;
                    } else {
                        q1 = val;
                    }
                }
                if (counter == medianIdx - 1 && dMedian) {
                    median = val;
                }
                if (counter == medianIdx) {
                    if (dMedian) {
                        median = (median + val) / 2;
                    } else {
                        median = val;
                    }
                }
                if (counter == q3Idx - 1 && dq3) {
                    q3 = val;
                }
                if (counter == q3Idx || (counter == st.size() - 1 && st.size() <= 3)) {
                    if (dq3) {
                        q3 = (q3 + val) / 2.0;
                    } else {
                        q3 = val;
                    }
                }
                counter++;
            }
            double iqr = q3 - q1;
            double lowerWhisker = min;
            double upperWhisker = max;
            // values beyond 1.5 * IQR are mild outliers, values beyond 3 * IQR are extreme outliers
            double upperWhiskerFence = q3 + (1.5 * iqr);
            double lowerWhiskerFence = q1 - (1.5 * iqr);
            double lowerFence = q1 - (3 * iqr);
            double upperFence = q3 + (3 * iqr);
            for (DataRow row : st) {
                double value = ((DoubleValue) row.getCell(0)).getDoubleValue();
                String rowKey = row.getKey().getString();
                if (value < lowerFence) {
                    extremeOutliers.add(new Outlier(value, rowKey));
                } else if (value < lowerWhiskerFence) {
                    mildOutliers.add(new Outlier(value, rowKey));
                } else if (lowerWhisker < lowerWhiskerFence && value >= lowerWhiskerFence) {
                    // first value inside the fence while the whisker is still outside of it
                    lowerWhisker = value;
                } else if (value <= upperWhiskerFence) {
                    // keeps being overwritten, so it ends up as the last value inside the fence
                    upperWhisker = value;
                } else if (value > upperFence) {
                    extremeOutliers.add(new Outlier(value, rowKey));
                } else if (value > upperWhiskerFence) {
                    mildOutliers.add(new Outlier(value, rowKey));
                }
            }
            colStats.put(catName, new BoxplotStatistics(mildOutliers, extremeOutliers, min, max, lowerWhisker, q1,
                median, q3, upperWhisker));
        }
        statsMap.put(colName, colStats);
        // missing values part
        String[] excludedColClasses = excludedColClassesList.toArray(new String[excludedColClassesList.size()]);
        excludedClasses.put(colName, excludedColClasses);
    }
    return statsMap;
}
use of org.knime.core.data.MissingCell in project knime-core by knime.
the class DecisionTreeNode method getWinnerAndClasscounts.
/**
 * Classifies a new pattern given as a row of values. Returns the winning
 * class and the class counts of all classes.
 *
 * <ul>
 * <li>If no winner node is found, the winner is a {@link MissingCell} and the class counts are empty.</li>
 * <li>If the winner node is a split node, the class counts are that node's class weights and the winner is the
 * class with the largest weight greater than zero (the first one in map order on ties). The winner is
 * {@code null} if no class has a positive weight.</li>
 * <li>Otherwise the winner node is treated as a leaf and its majority class and class counts are returned.</li>
 * </ul>
 *
 * @param row input pattern
 * @param spec the corresponding table spec
 * @return a pair of the predicted class (first) and the class counts (second)
 * @throws Exception if something went wrong (unknown attribute for example)
 */
public final Pair<DataCell, LinkedHashMap<DataCell, Double>> getWinnerAndClasscounts(final DataRow row, final DataTableSpec spec) throws Exception {
    final DecisionTreeNode winnerNode = getWinnerNode(row, spec);
    final LinkedHashMap<DataCell, Double> classCounts;
    DataCell winner = null;
    if (winnerNode == null) {
        // No node could be determined: report a missing cell and an empty map
        classCounts = new LinkedHashMap<DataCell, Double>();
        winner = new MissingCell("Error in decision tree prediction");
    } else if (winnerNode instanceof DecisionTreeNodeSplit) {
        // We stopped before reaching a leaf (eg due to a missing value)
        classCounts = ((DecisionTreeNodeSplit) winnerNode).getNodeClassWeights();
        double max = 0;
        // iterate the entries directly instead of looking every key up again
        for (java.util.Map.Entry<DataCell, Double> entry : classCounts.entrySet()) {
            final Double weight = entry.getValue();
            if (weight != null && weight > max) {
                max = weight;
                winner = entry.getKey();
            }
        }
    } else {
        // We reached a leaf node, return its score and class counts
        final DecisionTreeNodeLeaf leaf = (DecisionTreeNodeLeaf) winnerNode;
        winner = leaf.getMajorityClass();
        classCounts = leaf.getClassCounts();
    }
    return new Pair<DataCell, LinkedHashMap<DataCell, Double>>(winner, classCounts);
}
use of org.knime.core.data.MissingCell in project knime-core by knime.
the class DBRowIteratorImpl method next.
/**
 * {@inheritDoc}
 *
 * <p>Reads one cell per column of the spec, choosing the reader by the column's KNIME type and the SQL type
 * reported by the result set. A column whose read fails with an SQL or I/O exception is reported through
 * {@code handlerException} and becomes a missing cell carrying the exception message. The row key is the database
 * row number if it is available and wanted, the internal row counter otherwise.</p>
 */
@Override
public DataRow next() {
    final DataCell[] rowCells = new DataCell[m_spec.getNumColumns()];
    for (int col = 0; col < rowCells.length; col++) {
        final DataType colType = m_spec.getColumnSpec(col).getType();
        // stays Types.NULL in the error message if the meta data lookup itself fails
        int dbType = Types.NULL;
        try {
            dbType = m_result.getMetaData().getColumnType(col + 1);
            final DataCell value;
            if (colType.isCompatible(BooleanValue.class)) {
                // BIT, BOOLEAN and every other SQL type are read the same way
                value = readBoolean(col);
            } else if (colType.isCompatible(IntValue.class)) {
                switch (dbType) {
                    case Types.TINYINT:
                        value = readByte(col);
                        break;
                    case Types.SMALLINT:
                        value = readShort(col);
                        break;
                    case Types.INTEGER:
                    default:
                        value = readInt(col);
                }
            } else if (colType.isCompatible(LongValue.class)) {
                // BIGINT and every other SQL type are read the same way
                value = readLong(col);
            } else if (colType.isCompatible(DoubleValue.class)) {
                if (dbType == Types.REAL) {
                    value = readFloat(col);
                } else {
                    value = readDouble(col);
                }
            } else if (colType.isCompatible(DateAndTimeValue.class)) {
                switch (dbType) {
                    case Types.DATE:
                        value = readDate(col);
                        break;
                    case Types.TIME:
                        value = readTime(col);
                        break;
                    case Types.TIMESTAMP:
                        value = readTimestamp(col);
                        break;
                    default:
                        value = readString(col);
                }
            } else if (colType.isCompatible(BinaryObjectDataValue.class)) {
                switch (dbType) {
                    case Types.BLOB:
                        DataCell blob;
                        try {
                            blob = readBlob(col);
                        } catch (SQLException ex) {
                            // probably not supported (e.g. SQLite), therefore try another method
                            blob = readBytesAsBLOB(col);
                        }
                        value = blob;
                        break;
                    case Types.LONGVARCHAR:
                    case Types.LONGNVARCHAR:
                        value = readAsciiStream(col);
                        break;
                    case Types.BINARY:
                    case Types.LONGVARBINARY:
                    case Types.VARBINARY:
                        value = readBinaryStream(col);
                        break;
                    default:
                        value = readString(col);
                }
            } else {
                switch (dbType) {
                    case Types.CLOB:
                        value = readClob(col);
                        break;
                    case Types.ARRAY:
                        value = readArray(col);
                        break;
                    case Types.CHAR:
                    case Types.VARCHAR:
                    case Types.LONGVARCHAR:
                        value = readString(col);
                        break;
                    case Types.VARBINARY:
                        value = readBytesAsString(col);
                        break;
                    case Types.REF:
                        value = readRef(col);
                        break;
                    case Types.NCHAR:
                    case Types.NVARCHAR:
                    case Types.LONGNVARCHAR:
                        value = readNString(col);
                        break;
                    case Types.NCLOB:
                        value = readNClob(col);
                        break;
                    case Types.DATALINK:
                        value = readURL(col);
                        break;
                    case Types.STRUCT:
                    case Types.JAVA_OBJECT:
                    default:
                        value = readObject(col);
                        break;
                }
            }
            // finally set the new cell into the array of cells
            rowCells[col] = value;
        } catch (SQLException sqle) {
            handlerException("SQL Exception reading Object of type \"" + dbType + "\": ", sqle);
            rowCells[col] = new MissingCell(sqle.getMessage());
        } catch (IOException ioe) {
            handlerException("I/O Exception reading Object of type \"" + dbType + "\": ", ioe);
            rowCells[col] = new MissingCell(ioe.getMessage());
        }
    }
    // default to the row counter; only a usable database row number overrides it
    long keyIndex = m_rowCounter;
    try {
        final long dbRow = m_result.getRow();
        // Bug 2729: ResultSet#getRow return 0 if there is no row id
        if (dbRow > 0 && m_useDbRowId) {
            // first row in SQL always is 1, KNIME starts with 0
            keyIndex = m_rowIdsStartWithZero ? dbRow - 1 : dbRow;
        }
    } catch (SQLException sqle) {
        // ignored: use m_rowCounter
    }
    m_rowCounter++;
    return new DefaultRow(RowKey.createRowKey(keyIndex), rowCells);
}
use of org.knime.core.data.MissingCell in project knime-core by knime.
the class DoubleMovingAverageMissingCellHandler method getCell.
/**
 * {@inheritDoc}
 *
 * <p>Averages the double values of all non-missing cells found in the window between the look-behind and the
 * look-ahead offset (both inclusive). Returns a missing cell if the window holds no usable cell.</p>
 */
@Override
public DataCell getCell(final RowKey key, final DataColumnWindow window) {
    double total = 0.0;
    int used = 0;
    for (int offset = -m_lookbehindSize.getIntValue(); offset < m_lookaheadSize.getIntValue() + 1; offset++) {
        final DataCell candidate = window.getNthCell(offset);
        if (candidate == null || candidate.isMissing()) {
            // nothing to contribute at this window position
            continue;
        }
        total += ((DoubleValue) candidate).getDoubleValue();
        used++;
    }
    if (used <= 0) {
        return new MissingCell("No cells for average calculation available");
    }
    return new DoubleCell(total / used);
}
use of org.knime.core.data.MissingCell in project knime-core by knime.
the class TestDataGenerator method createNumericAttributeColumnData.
/**
 * Creates numeric column data from an array of values. Rows are keyed by their array position and every NaN
 * value is added as a missing cell.
 *
 * @param values the values of the column, NaN marks a missing value
 * @param name the name of the column
 * @param attributeIndex the attribute index that is set in the meta data of the created column
 * @return the created column data
 */
public TreeOrdinaryNumericColumnData createNumericAttributeColumnData(final double[] values, final String name, final int attributeIndex) {
    final DataColumnSpec attributeSpec = new DataColumnSpecCreator(name, DoubleCell.TYPE).createSpec();
    final TreeOrdinaryNumericColumnDataCreator creator = new TreeOrdinaryNumericColumnDataCreator(attributeSpec);
    long rowIndex = 0;
    for (final double value : values) {
        final RowKey rowKey = RowKey.createRowKey(rowIndex);
        rowIndex++;
        if (!Double.isNaN(value)) {
            creator.add(rowKey, new DoubleCell(value));
        } else {
            creator.add(rowKey, new MissingCell(null));
        }
    }
    final TreeOrdinaryNumericColumnData columnData = creator.createColumnData(0, m_config);
    columnData.getMetaData().setAttributeIndex(attributeIndex);
    return columnData;
}
Aggregations