Use of org.knime.core.data.container.DataContainer in project knime-core by knime: class BoxPlotNodeModel, method execute().
/**
* {@inheritDoc}
*/
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
if (inData[0] == null) {
return new BufferedDataTable[] {};
}
BufferedDataTable table = inData[0];
m_statistics = new LinkedHashMap<DataColumnSpec, double[]>();
m_mildOutliers = new LinkedHashMap<String, Map<Double, Set<RowKey>>>();
m_extremeOutliers = new LinkedHashMap<String, Map<Double, Set<RowKey>>>();
int colIdx = 0;
List<DataColumnSpec> outputColSpecs = new ArrayList<DataColumnSpec>();
double subProgress = 1.0 / getNumNumericColumns(table.getDataTableSpec());
for (DataColumnSpec colSpec : table.getDataTableSpec()) {
ExecutionContext colExec = exec.createSubExecutionContext(subProgress);
exec.checkCanceled();
if (colSpec.getType().isCompatible(DoubleValue.class)) {
double[] statistic = new double[SIZE];
outputColSpecs.add(colSpec);
List<String> col = new ArrayList<String>();
col.add(colSpec.getName());
ExecutionContext sortExec = colExec.createSubExecutionContext(0.75);
ExecutionContext findExec = colExec.createSubExecutionContext(0.25);
SortedTable sorted = new SortedTable(table, col, new boolean[] { true }, sortExec);
long currRowAbsolute = 0;
int currCountingRow = 1;
double lastValue = 1;
long nrOfRows = table.size();
boolean first = true;
for (DataRow row : sorted) {
exec.checkCanceled();
double rowProgress = currRowAbsolute / (double) table.size();
findExec.setProgress(rowProgress, "determining statistics for: " + table.getDataTableSpec().getColumnSpec(colIdx).getName());
if (row.getCell(colIdx).isMissing()) {
// missing values are sorted to the top of the table, so skip them here
currRowAbsolute++;
nrOfRows--;
continue;
}
// get the first value = actually observed minimum
if (first) {
statistic[MIN] = ((DoubleValue) row.getCell(colIdx)).getDoubleValue();
// initialize the statistics with the first value;
// if the table is large enough these will be overridden,
// this only matters for tables with fewer than 5 rows
statistic[MEDIAN] = statistic[MIN];
statistic[LOWER_QUARTILE] = statistic[MIN];
statistic[UPPER_QUARTILE] = statistic[MIN];
first = false;
}
// get the last value = actually observed maximum
if (currRowAbsolute == table.size() - 1) {
statistic[MAX] = ((DoubleValue) row.getCell(colIdx)).getDoubleValue();
}
float medianPos = nrOfRows * 0.5f;
float lowerQuartilePos = nrOfRows * 0.25f;
float upperQuartilePos = nrOfRows * 0.75f;
if (currCountingRow == (int) Math.floor(lowerQuartilePos) + 1) {
if (lowerQuartilePos % 1 != 0) {
// get the row's value
statistic[LOWER_QUARTILE] = ((DoubleValue) row.getCell(colIdx)).getDoubleValue();
} else {
// calculate the mean between row and last row
double value = ((DoubleValue) row.getCell(colIdx)).getDoubleValue();
statistic[LOWER_QUARTILE] = (value + lastValue) / 2;
}
}
if (currCountingRow == (int) Math.floor(medianPos) + 1) {
if (medianPos % 1 != 0) {
// get the row's value
statistic[MEDIAN] = ((DoubleValue) row.getCell(colIdx)).getDoubleValue();
} else {
// calculate the mean between row and last row
double value = ((DoubleValue) row.getCell(colIdx)).getDoubleValue();
statistic[MEDIAN] = (value + lastValue) / 2;
}
}
if (currCountingRow == (int) Math.floor(upperQuartilePos) + 1) {
if (upperQuartilePos % 1 != 0) {
// get the row's value
statistic[UPPER_QUARTILE] = ((DoubleValue) row.getCell(colIdx)).getDoubleValue();
} else {
// calculate the mean between row and last row
double value = ((DoubleValue) row.getCell(colIdx)).getDoubleValue();
statistic[UPPER_QUARTILE] = (value + lastValue) / 2;
}
}
lastValue = ((DoubleValue) row.getCell(colIdx)).getDoubleValue();
currRowAbsolute++;
currCountingRow++;
}
double iqr = statistic[UPPER_QUARTILE] - statistic[LOWER_QUARTILE];
Map<Double, Set<RowKey>> mild = new LinkedHashMap<Double, Set<RowKey>>();
Map<Double, Set<RowKey>> extreme = new LinkedHashMap<Double, Set<RowKey>>();
// by default the whiskers are at min and max
double[] whiskers = new double[] { statistic[MIN], statistic[MAX] };
if (statistic[MIN] < (statistic[LOWER_QUARTILE] - (1.5 * iqr)) || statistic[MAX] > statistic[UPPER_QUARTILE] + (1.5 * iqr)) {
detectOutliers(sorted, iqr, new double[] { statistic[LOWER_QUARTILE], statistic[UPPER_QUARTILE] }, mild, extreme, whiskers, colIdx);
}
statistic[LOWER_WHISKER] = whiskers[0];
statistic[UPPER_WHISKER] = whiskers[1];
m_mildOutliers.put(colSpec.getName(), mild);
m_extremeOutliers.put(colSpec.getName(), extreme);
m_statistics.put(colSpec, statistic);
}
colIdx++;
}
DataContainer container = createOutputTable(exec, outputColSpecs);
// return a data array with just one row but with the data table spec
// for the column selection panel
m_array = new DefaultDataArray(table, 1, 2);
return new BufferedDataTable[] { exec.createBufferedDataTable(container.getTable(), exec) };
}
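The quartile logic above walks the sorted column once: when a quartile position (n * 0.25, n * 0.5, n * 0.75) falls between two rows it takes the next observed value, and when it falls exactly on a row boundary it averages the two neighbouring values. A minimal, self-contained sketch of the same rule, assuming an already sorted array without missing values (the method is illustrative, not part of the KNIME API):

// Illustrative only: the quantile rule used by the loop above, on a sorted array.
static double quantile(final double[] sorted, final double q) {
    final int n = sorted.length;
    final float pos = n * (float) q;        // e.g. n * 0.25f for the lower quartile
    final int idx = (int) Math.floor(pos);  // 0-based index of the row after the cut
    if (pos % 1 != 0) {
        return sorted[idx];                 // between two rows: take the next value
    }
    return (sorted[idx - 1] + sorted[idx]) / 2.0;  // on a boundary: average the neighbours
}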
Use of org.knime.core.data.container.DataContainer in project knime-core by knime: class BoxPlotNodeModel, method createOutputTable().
private DataContainer createOutputTable(final ExecutionContext exec, final List<DataColumnSpec> outputColSpecs) {
DataTableSpec outSpec = createOutputSpec(outputColSpecs);
DataContainer container = exec.createDataContainer(outSpec);
String[] rowKeys = new String[SIZE];
rowKeys[MIN] = "Minimum";
rowKeys[LOWER_WHISKER] = "Smallest";
rowKeys[LOWER_QUARTILE] = "Lower Quartile";
rowKeys[MEDIAN] = "Median";
rowKeys[UPPER_QUARTILE] = "Upper Quartile";
rowKeys[UPPER_WHISKER] = "Largest";
rowKeys[MAX] = "Maximum";
for (int i = 0; i < SIZE; i++) {
DataCell[] cells = new DataCell[outputColSpecs.size()];
for (int j = 0; j < cells.length; j++) {
double[] stats = m_statistics.get(outputColSpecs.get(j));
cells[j] = new DoubleCell(stats[i]);
}
DataRow row = new DefaultRow(rowKeys[i], cells);
container.addRowToTable(row);
}
container.close();
return container;
}
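The pattern in createOutputTable is the standard DataContainer lifecycle: create a container for a spec, add rows, close the container, then hand the finished table on. A minimal sketch of that lifecycle in isolation (column name, row key and value are made up for illustration; inside a NodeModel you would usually prefer exec.createDataContainer(spec), as the code above does):

// Illustrative only: the basic DataContainer lifecycle.
DataTableSpec spec = new DataTableSpec(new String[] { "value" }, new DataType[] { DoubleCell.TYPE });
DataContainer container = new DataContainer(spec);
container.addRowToTable(new DefaultRow(new RowKey("row0"), new DoubleCell(1.0)));
container.close();                       // no rows may be added after close()
DataTable result = container.getTable(); // only available once the container is closed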
Use of org.knime.core.data.container.DataContainer in project knime-core by knime: class BoxplotCalculator, method calculateMultipleConditional().
/**
* Calculates statistics for a conditional box plot.
* @param table the data table
* @param catCol the column with the category values
* @param numCol the numeric column
* @param exec an execution context
* @return A linked hash map with BoxplotStatistics for each category
* @throws CanceledExecutionException when the user cancels the execution
* @throws InvalidSettingsException when the category column has no domain values
*/
public LinkedHashMap<String, LinkedHashMap<String, BoxplotStatistics>> calculateMultipleConditional(final BufferedDataTable table, final String catCol, final String[] numCol, final ExecutionContext exec) throws CanceledExecutionException, InvalidSettingsException {
DataTableSpec spec = table.getSpec();
int catColIdx = spec.findColumnIndex(catCol);
int[] numColIdxs = new int[numCol.length];
for (int i = 0; i < numCol.length; i++) {
numColIdxs[i] = spec.findColumnIndex(numCol[i]);
}
Set<DataCell> valuesSet = spec.getColumnSpec(catColIdx).getDomain().getValues();
if (valuesSet == null) {
throw new InvalidSettingsException("Selected category column has no domain values");
}
ArrayList<DataCell> vals = new ArrayList<>(valuesSet);
Collections.sort(vals, new Comparator<DataCell>() {
@Override
public int compare(final DataCell o1, final DataCell o2) {
return o1.toString().compareTo(o2.toString());
}
});
// add the missing-values class, as it never appears in the domain specification
vals.add(new MissingCell(null));
// we need explicit names, otherwise the missing-values class would be rendered as "?"
ArrayList<String> catNames = new ArrayList<>(vals.size());
for (DataCell cell : vals) {
catNames.add(cell.isMissing() ? MISSING_VALUES_CLASS : cell.toString());
}
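// one container per (numeric column, class) pair collects that column's non-missing values for the class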
LinkedHashMap<String, LinkedHashMap<String, DataContainer>> containers = new LinkedHashMap<>();
m_ignoredMissVals = new LinkedHashMap<>();
for (int i = 0; i < numCol.length; i++) {
LinkedHashMap<String, DataContainer> map = new LinkedHashMap<>();
LinkedHashMap<String, Long> missValMap = new LinkedHashMap<>();
for (DataCell c : vals) {
String name = c.isMissing() ? MISSING_VALUES_CLASS : c.toString();
map.put(name, exec.createDataContainer(new DataTableSpec(new String[] { "col" }, new DataType[] { DoubleCell.TYPE })));
missValMap.put(name, 0L);
}
containers.put(numCol[i], map);
m_ignoredMissVals.put(numCol[i], missValMap);
}
ExecutionContext subExec = exec.createSubExecutionContext(0.7);
// long[][] ignoredMissVals = new long[numCol.length][vals.size()]; // count missing values per data col per class
long count = 0;
final long numOfRows = table.size();
for (DataRow row : table) {
exec.checkCanceled();
subExec.setProgress(count++ / (double) numOfRows);
DataCell catCell = row.getCell(catColIdx);
String catName = catCell.isMissing() ? MISSING_VALUES_CLASS : catCell.toString();
for (int i = 0; i < numCol.length; i++) {
DataCell cell = row.getCell(numColIdxs[i]);
if (!cell.isMissing()) {
containers.get(numCol[i]).get(catName).addRowToTable(new DefaultRow(row.getKey(), cell));
} else {
// increment missing values
LinkedHashMap<String, Long> missValMap = m_ignoredMissVals.get(numCol[i]);
missValMap.replace(catName, missValMap.get(catName) + 1);
}
}
}
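// second pass: close each per-class container and derive quartiles, whiskers and outliers per column and class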
LinkedHashMap<String, LinkedHashMap<String, BoxplotStatistics>> statsMap = new LinkedHashMap<>();
excludedClasses = new LinkedHashMap<>();
List<String> colList = Arrays.asList(numCol);
ExecutionContext subExec2 = exec.createSubExecutionContext(1.0);
int count2 = 0;
for (Entry<String, LinkedHashMap<String, DataContainer>> entry : containers.entrySet()) {
exec.checkCanceled();
subExec2.setProgress(count2++ / (double) containers.size());
LinkedHashMap<String, DataContainer> containers2 = entry.getValue();
LinkedHashMap<String, BoxplotStatistics> colStats = new LinkedHashMap<String, BoxplotStatistics>();
String colName = entry.getKey();
List<String> excludedColClassesList = new ArrayList<>();
LinkedHashMap<String, Long> ignoredColMissVals = new LinkedHashMap<>();
for (Entry<String, DataContainer> entry2 : containers2.entrySet()) {
Set<Outlier> extremeOutliers = new HashSet<Outlier>();
Set<Outlier> mildOutliers = new HashSet<Outlier>();
entry2.getValue().close();
String catName = entry2.getKey();
BufferedDataTable catTable = (BufferedDataTable) entry2.getValue().getTable();
LinkedHashMap<String, Long> missValMap = m_ignoredMissVals.get(colName);
if (catTable.size() == 0) {
if (!(catName.equals(MISSING_VALUES_CLASS) && missValMap.get(catName) == 0)) {
// add the missing-values class to the excluded list only if missing values actually occurred
excludedColClassesList.add(catName);
}
missValMap.remove(catName);
continue;
} else {
if (missValMap.get(catName) == 0) {
missValMap.remove(catName);
}
}
SortedTable st = new SortedTable(catTable, new Comparator<DataRow>() {
@Override
public int compare(final DataRow o1, final DataRow o2) {
double d1 = ((DoubleValue) o1.getCell(0)).getDoubleValue();
double d2 = ((DoubleValue) o2.getCell(0)).getDoubleValue();
if (d1 == d2) {
return 0;
} else {
return d1 < d2 ? -1 : 1;
}
}
}, false, exec);
double min = 0, max = 0, q1 = 0, q3 = 0, median = 0;
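// dq1/dMedian/dq3 flag quartile positions that fall exactly between two rows, where the neighbouring values are averaged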
boolean dq1 = catTable.size() % 4 == 0;
long q1Idx = catTable.size() / 4;
boolean dq3 = 3 * catTable.size() % 4 == 0;
long q3Idx = 3 * catTable.size() / 4;
boolean dMedian = catTable.size() % 2 == 0;
long medianIdx = catTable.size() / 2;
int counter = 0;
for (DataRow row : st) {
double val = ((DoubleValue) row.getCell(0)).getDoubleValue();
if (counter == 0) {
min = val;
}
if (counter == catTable.size() - 1) {
max = val;
}
if (counter == q1Idx - 1 && dq1) {
q1 = val;
}
if (counter == q1Idx || (counter == 0 && st.size() <= 3)) {
if (dq1) {
q1 = (q1 + val) / 2.0;
} else {
q1 = val;
}
}
if (counter == medianIdx - 1 && dMedian) {
median = val;
}
if (counter == medianIdx) {
if (dMedian) {
median = (median + val) / 2;
} else {
median = val;
}
}
if (counter == q3Idx - 1 && dq3) {
q3 = val;
}
if (counter == q3Idx || (counter == st.size() - 1 && st.size() <= 3)) {
if (dq3) {
q3 = (q3 + val) / 2.0;
} else {
q3 = val;
}
}
counter++;
}
double iqr = q3 - q1;
double lowerWhisker = min;
double upperWhisker = max;
double upperWhiskerFence = q3 + (1.5 * iqr);
double lowerWhiskerFence = q1 - (1.5 * iqr);
double lowerFence = q1 - (3 * iqr);
double upperFence = q3 + (3 * iqr);
for (DataRow row : st) {
double value = ((DoubleValue) row.getCell(0)).getDoubleValue();
String rowKey = row.getKey().getString();
if (value < lowerFence) {
extremeOutliers.add(new Outlier(value, rowKey));
} else if (value < lowerWhiskerFence) {
mildOutliers.add(new Outlier(value, rowKey));
} else if (lowerWhisker < lowerWhiskerFence && value >= lowerWhiskerFence) {
lowerWhisker = value;
} else if (value <= upperWhiskerFence) {
upperWhisker = value;
} else if (value > upperFence) {
extremeOutliers.add(new Outlier(value, rowKey));
} else if (value > upperWhiskerFence) {
mildOutliers.add(new Outlier(value, rowKey));
}
}
colStats.put(catName, new BoxplotStatistics(mildOutliers, extremeOutliers, min, max, lowerWhisker, q1, median, q3, upperWhisker));
}
statsMap.put(colName, colStats);
// missing values part
String[] excludedColClasses = excludedColClassesList.toArray(new String[excludedColClassesList.size()]);
excludedClasses.put(colName, excludedColClasses);
}
return statsMap;
}
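The second loop above applies the usual box-plot fences: values beyond the quartiles by more than 3 * IQR are extreme outliers, values beyond 1.5 * IQR are mild outliers, and the whiskers end at the most extreme values still inside the 1.5 * IQR fences. A small sketch of the classification rule on its own (the enum and method are illustrative, not KNIME API):

// Illustrative only: classify a value against the box-plot fences used above.
enum Fence { NORMAL, MILD_OUTLIER, EXTREME_OUTLIER }

static Fence classify(final double value, final double q1, final double q3) {
    final double iqr = q3 - q1;
    if (value < q1 - 3 * iqr || value > q3 + 3 * iqr) {
        return Fence.EXTREME_OUTLIER;
    }
    if (value < q1 - 1.5 * iqr || value > q3 + 1.5 * iqr) {
        return Fence.MILD_OUTLIER;
    }
    return Fence.NORMAL;
}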
Use of org.knime.core.data.container.DataContainer in project knime-core by knime: class ColorExtractNodeModel, method extractColorTable().
/**
* Converts a range color model into a two-row table (min, max) holding the mapped
* value and the color's red, green, blue, alpha and packed RGB components.
* @param range the range color model to convert
* @return the extracted color table
*/
private DataTable extractColorTable(final ColorModelRange range) {
DataTableSpec spec = createSpec(DoubleCell.TYPE);
DataContainer cnt = new DataContainer(spec);
RowKey[] keys = new RowKey[] { new RowKey("min"), new RowKey("max") };
Color[] clrs = new Color[] { range.getMinColor(), range.getMaxColor() };
double[] vals = new double[] { range.getMinValue(), range.getMaxValue() };
for (int i = 0; i < 2; i++) {
Color clr = clrs[i];
DataRow row = new DefaultRow(keys[i], new DoubleCell(vals[i]), new IntCell(clr.getRed()), new IntCell(clr.getGreen()), new IntCell(clr.getBlue()), new IntCell(clr.getAlpha()), new IntCell(clr.getRGB()));
cnt.addRowToTable(row);
}
cnt.close();
return cnt.getTable();
}
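The helper createSpec is not shown in this snippet; judging from the row built above (a double value followed by red, green, blue, alpha and the packed RGB integer), it presumably looks roughly like the following hedged reconstruction (not the actual KNIME source):

// Hypothetical reconstruction of createSpec, inferred from the row layout above.
private DataTableSpec createSpec(final DataType valueType) {
    return new DataTableSpec(
        new String[] { "value", "R", "G", "B", "alpha", "RGB" },
        new DataType[] { valueType, IntCell.TYPE, IntCell.TYPE, IntCell.TYPE, IntCell.TYPE, IntCell.TYPE });
}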
Use of org.knime.core.data.container.DataContainer in project knime-core by knime: class ClusterNodeModel, method execute().
/**
* Generate new clustering based on InputDataTable and specified number of
* clusters. Currently the objective function only looks for cluster centers
* that are extremely similar to the first n patterns...
*
* {@inheritDoc}
*/
@Override
protected PortObject[] execute(final PortObject[] data, final ExecutionContext exec) throws Exception {
// FIXME actually do something useful with missing values!
BufferedDataTable inData = (BufferedDataTable) data[0];
DataTableSpec spec = inData.getDataTableSpec();
// get dimension of feature space
m_dimension = inData.getDataTableSpec().getNumColumns();
HashMap<RowKey, Set<RowKey>> mapping = new HashMap<RowKey, Set<RowKey>>();
addExcludeColumnsToIgnoreList(spec);
double[][] clusters = initializeClusters(inData);
// also keep counts of how many patterns fall in a specific cluster
int[] clusterCoverage = new int[m_nrOfClusters.getIntValue()];
// --------- create clusters --------------
// reserve space for cluster center updates (do batch update!)
double[][] delta = new double[m_nrOfClusters.getIntValue()][];
for (int c = 0; c < m_nrOfClusters.getIntValue(); c++) {
delta[c] = new double[m_dimension - m_nrIgnoredColumns];
}
// main loop - until clusters stop changing or maxNrIterations reached
int currentIteration = 0;
boolean finished = false;
while ((!finished) && (currentIteration < m_nrMaxIterations.getIntValue())) {
exec.checkCanceled();
exec.setProgress((double) currentIteration / (double) m_nrMaxIterations.getIntValue(), "Iteration " + currentIteration);
// initialize counts and cluster-deltas
for (int c = 0; c < m_nrOfClusters.getIntValue(); c++) {
clusterCoverage[c] = 0;
delta[c] = new double[m_dimension - m_nrIgnoredColumns];
int deltaPos = 0;
for (int i = 0; i < m_dimension; i++) {
if (!m_ignoreColumn[i]) {
delta[c][deltaPos++] = 0.0;
}
}
}
// assume that we are done (i.e. clusters have stopped changing)
finished = true;
// first training example
RowIterator rowIt = inData.iterator();
while (rowIt.hasNext()) {
DataRow currentRow = rowIt.next();
int winner = findClosestPrototypeFor(currentRow, clusters);
if (winner >= 0) {
// update winning cluster centers delta
int deltaPos = 0;
for (int i = 0; i < m_dimension; i++) {
DataCell currentCell = currentRow.getCell(i);
if (!m_ignoreColumn[i]) {
if (!currentCell.isMissing()) {
delta[winner][deltaPos] += ((DoubleValue) (currentCell)).getDoubleValue();
} else {
throw new Exception("Missing Values not (yet) allowed in k-Means.");
}
deltaPos++;
}
}
clusterCoverage[winner]++;
} else {
// this should never happen: report it rather than silently producing a wrong result
assert (winner >= 0);
throw new IllegalStateException("No winner found: " + winner);
}
}
// update cluster centers
finished = updateClusterCenters(clusterCoverage, clusters, delta);
currentIteration++;
}
// while(!finished & nrIt<maxNrIt)
// create list of feature names
// index of not-ignored columns
int k = 0;
// index of column
int j = 0;
String[] featureNames = new String[m_dimension];
do {
if (!m_ignoreColumn[j]) {
featureNames[k] = spec.getColumnSpec(j).getName();
k++;
}
j++;
} while (j < m_dimension);
// create the output container and also the mapping for hiliting
BufferedDataContainer labeledInput = exec.createDataContainer(createAppendedSpec(spec));
for (DataRow row : inData) {
int winner = findClosestPrototypeFor(row, clusters);
DataCell cell = new StringCell(CLUSTER + winner);
labeledInput.addRowToTable(new AppendedColumnRow(row, cell));
if (m_enableHilite.getBooleanValue()) {
RowKey key = new RowKey(CLUSTER + winner);
if (mapping.get(key) == null) {
Set<RowKey> set = new HashSet<RowKey>();
set.add(row.getKey());
mapping.put(key, set);
} else {
mapping.get(key).add(row.getKey());
}
}
}
labeledInput.close();
if (m_enableHilite.getBooleanValue()) {
m_translator.setMapper(new DefaultHiLiteMapper(mapping));
}
BufferedDataTable outData = labeledInput.getTable();
// handle the optional PMML input
PMMLPortObject inPMMLPort = m_pmmlInEnabled ? (PMMLPortObject) data[1] : null;
PMMLPortObjectSpec inPMMLSpec = null;
if (inPMMLPort != null) {
inPMMLSpec = inPMMLPort.getSpec();
}
PMMLPortObjectSpec pmmlOutSpec = createPMMLSpec(inPMMLSpec, spec);
PMMLPortObject outPMMLPort = new PMMLPortObject(pmmlOutSpec, inPMMLPort, spec);
Set<String> columns = new LinkedHashSet<String>();
for (String s : pmmlOutSpec.getLearningFields()) {
columns.add(s);
}
outPMMLPort.addModelTranslater(new PMMLClusterTranslator(ComparisonMeasure.squaredEuclidean, m_nrOfClusters.getIntValue(), clusters, clusterCoverage, columns));
m_viewData = new ClusterViewData(clusters, clusterCoverage, m_dimension - m_nrIgnoredColumns, featureNames);
if (m_outputCenters) {
DataContainer clusterCenterContainer = exec.createDataContainer(createClusterCentersSpec(spec));
int i = 0;
for (double[] cluster : clusters) {
List<DataCell> cells = new ArrayList<>();
for (double d : cluster) {
cells.add(new DoubleCell(d));
}
clusterCenterContainer.addRowToTable(new DefaultRow(new RowKey(PMMLClusterTranslator.CLUSTER_NAME_PREFIX + i++), cells));
}
clusterCenterContainer.close();
return new PortObject[] { outData, (BufferedDataTable) clusterCenterContainer.getTable(), outPMMLPort };
} else {
return new PortObject[] { outData, outPMMLPort };
}
}
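The batch update accumulates, per cluster, the coordinate sums of the assigned rows (delta) and how many rows were assigned (clusterCoverage); the new center of a cluster is then the mean of its members, and the loop stops once no center moves any more. updateClusterCenters is not shown here; a hedged sketch of what such a batch k-means update typically looks like (not the actual KNIME implementation):

// Hypothetical sketch of a batch k-means center update: newCenter = delta / coverage.
static boolean updateCenters(final int[] coverage, final double[][] clusters, final double[][] delta, final double epsilon) {
    boolean converged = true;
    for (int c = 0; c < clusters.length; c++) {
        if (coverage[c] == 0) {
            continue;                      // empty cluster: keep its previous center
        }
        for (int d = 0; d < clusters[c].length; d++) {
            final double mean = delta[c][d] / coverage[c];
            if (Math.abs(mean - clusters[c][d]) > epsilon) {
                converged = false;         // at least one center still moves
            }
            clusters[c][d] = mean;
        }
    }
    return converged;                      // true once the clustering has stabilized
}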