use of org.knime.core.data.DoubleValue in project knime-core by knime.
the class CorrelationComputer method calculateStatistics.
/**
 * First scan on the data. Calculates (pair wise) means and std dev
 * and determines the list of distinct values for each categorical column.
 *
 * <p>For every ordered pair (i, j) of numeric columns, the sum and the sum of squares of
 * column i are accumulated over exactly those rows in which neither column i nor column j
 * is missing. From these the pair-wise mean and the sample standard deviation (denominator
 * {@code validCount - 1}) are derived. The results are stored in {@code m_numericMeanMatrix},
 * {@code m_numericStdDevMatrix} and {@code m_numericValidCountMatrix}.
 *
 * <p>Numeric columns containing at least one missing cell are recorded in
 * {@code m_numericsWithMissings}. For each categorical column the distinct cells (including
 * the missing cell) are collected in {@code m_possibleValues} and numbered in order of first
 * occurrence; a column exceeding {@code m_maxPossibleValues} distinct values gets a
 * {@code null} entry instead.
 *
 * @param table the table to scan; its spec is asserted to be structurally equal to
 *            {@code m_tableSpec}
 * @param exec used for the per-row cancellation check and progress message
 * @throws CanceledExecutionException propagated from {@code exec.checkCanceled()}
 */
@SuppressWarnings("unchecked")
public void calculateStatistics(final BufferedDataTable table, final ExecutionContext exec) throws CanceledExecutionException {
    DataTableSpec filterTableSpec = table.getDataTableSpec();
    assert filterTableSpec.equalStructure(m_tableSpec);
    // generic array creation is not possible, hence the raw array and the @SuppressWarnings
    m_possibleValues = new LinkedHashMap[m_categoricalColIndexMap.length];
    for (int i = 0; i < m_possibleValues.length; i++) {
        m_possibleValues[i] = new LinkedHashMap<DataCell, Integer>();
    }
    final int numericColCount = m_numericColIndexMap.length;
    // [i][j]: sum / sum of squares of column i over the rows where column j is not missing
    double[][] sumMatrix = new double[numericColCount][numericColCount];
    double[][] sumSqMatrix = new double[numericColCount][numericColCount];
    HalfIntMatrix validCountMatrix = new HalfIntMatrix(numericColCount, true);
    final DataCell[] cells = new DataCell[m_tableSpec.getNumColumns()];
    long rowIndex = 0;
    final long rowCount = table.size();
    for (DataRow r : table) {
        // each cell of the row is accessed multiple times below, so we buffer it
        for (int i = 0; i < cells.length; i++) {
            cells[i] = r.getCell(i);
        }
        for (int i = 0; i < m_numericColIndexMap.length; i++) {
            DataCell c = cells[m_numericColIndexMap[i]];
            final boolean isMissing = c.isMissing();
            if (isMissing) {
                m_numericsWithMissings.add(m_numericColIndexMap[i]);
            } else {
                final double val = ((DoubleValue) c).getDoubleValue();
                final double valSquare = val * val;
                for (int j = 0; j < m_numericColIndexMap.length; j++) {
                    if (!cells[m_numericColIndexMap[j]].isMissing()) {
                        sumMatrix[i][j] += val;
                        sumSqMatrix[i][j] += valSquare;
                        if (j >= i) {
                            // don't count twice
                            validCountMatrix.add(i, j, 1);
                        }
                    }
                }
            }
        }
        for (int i = 0; i < m_categoricalColIndexMap.length; i++) {
            // read from the row buffer filled above instead of asking the row again
            DataCell c = cells[m_categoricalColIndexMap[i]];
            if (m_possibleValues[i] != null) {
                // note: also take missing value as possible value
                m_possibleValues[i].put(c, null);
                if (m_possibleValues[i].size() > m_maxPossibleValues) {
                    // too many distinct values: stop tracking this column
                    m_possibleValues[i] = null;
                }
            }
        }
        exec.checkCanceled();
        exec.setProgress(rowIndex / (double) rowCount, String.format("Calculating statistics - %d/%d (\"%s\")", rowIndex, rowCount, r.getKey()));
        rowIndex += 1;
    }
    // assign each distinct categorical value its index in order of first occurrence
    for (LinkedHashMap<DataCell, Integer> map : m_possibleValues) {
        if (map != null) {
            int index = 0;
            for (Map.Entry<DataCell, Integer> entry : map.entrySet()) {
                entry.setValue(index++);
            }
        }
    }
    // sumSqMatrix --> m_numericStdDevMatrix
    for (int i = 0; i < numericColCount; i++) {
        for (int j = 0; j < numericColCount; j++) {
            final int validCount = validCountMatrix.get(i, j);
            if (validCount > 1) {
                double variance = (sumSqMatrix[i][j] - (sumMatrix[i][j] * sumMatrix[i][j]) / validCount) / (validCount - 1);
                if (variance < PMCCPortObjectAndSpec.ROUND_ERROR_OK) {
                    // treat tiny (or slightly negative) variances as exactly zero
                    variance = 0.0;
                }
                sumSqMatrix[i][j] = Math.sqrt(variance);
            } else {
                sumSqMatrix[i][j] = 0.0;
            }
            // sumMatrix --> m_numericMeanMatrix (NaN if there is no row valid for the pair)
            sumMatrix[i][j] = validCount > 0 ? sumMatrix[i][j] / validCount : Double.NaN;
        }
    }
    m_numericMeanMatrix = sumMatrix;
    m_numericStdDevMatrix = sumSqMatrix;
    m_numericValidCountMatrix = validCountMatrix;
}
use of org.knime.core.data.DoubleValue in project knime-core by knime.
the class CorrelationComputer method calculateOutput.
/**
 * Second scan on data. Computes the pair wise correlation for numeric
 * columns and reads the contingency tables of pairs of categorical
 * columns into memory.
 *
 * <p>Relies on the fields filled by {@code calculateStatistics} ({@code m_possibleValues},
 * {@code m_numericMeanMatrix}, {@code m_numericStdDevMatrix},
 * {@code m_numericValidCountMatrix}). Numeric columns (or column pairs) whose standard
 * deviation is zero are recorded in {@code m_numericsWithConstantValues} and get
 * {@link Double#NaN} as result.
 *
 * @param table the table to scan; its spec is asserted to be structurally equal to
 *            {@code m_tableSpec}
 * @param exec used for the per-row cancellation check and progress message
 * @return the output matrix to be turned into the output model. It is indexed by the column
 *         indices of {@code m_tableSpec} (no diagonal): numeric pairs hold the correlation,
 *         categorical pairs hold the result of {@code computeCramersV} (or NaN if either
 *         column has too many distinct values), all other pairs hold NaN.
 * @throws CanceledExecutionException propagated from {@code exec.checkCanceled()}
 */
public HalfDoubleMatrix calculateOutput(final BufferedDataTable table, final ExecutionMonitor exec) throws CanceledExecutionException {
    assert table.getDataTableSpec().equalStructure(m_tableSpec);
    int catCount = m_categoricalColIndexMap.length;
    int categoricalPairsCount = (catCount - 1) * catCount / 2;
    // stores all pair-wise contingency tables,
    // contingencyTables[i] == null <--> either column of the corresponding
    // pair has more than m_maxPossibleValues values
    // http://en.wikipedia.org/wiki/Contingency_table
    int[][][] contingencyTables = new int[categoricalPairsCount][][];
    int valIndex = 0;
    for (int i = 0; i < m_categoricalColIndexMap.length; i++) {
        for (int j = i + 1; j < m_categoricalColIndexMap.length; j++) {
            LinkedHashMap<DataCell, Integer> valuesI = m_possibleValues[i];
            LinkedHashMap<DataCell, Integer> valuesJ = m_possibleValues[j];
            if (valuesI != null && valuesJ != null) {
                int iSize = valuesI.size();
                int jSize = valuesJ.size();
                contingencyTables[valIndex] = new int[iSize][jSize];
            }
            valIndex++;
        }
    }
    final int numColumns = m_tableSpec.getNumColumns();
    HalfDoubleMatrix nominatorMatrix = new HalfDoubleMatrix(numColumns, /*includeDiagonal=*/
    false);
    nominatorMatrix.fill(Double.NaN);
    long rowIndex = 0;
    DataCell[] cells = new DataCell[numColumns];
    final long rowCount = table.size();
    // initialize the numeric pairs: 0.0 where a correlation can be accumulated,
    // NaN where one of the (conditional) standard deviations is zero
    for (int i = 0; i < m_numericColIndexMap.length; i++) {
        final double stdDevI = m_numericStdDevMatrix[i][i];
        if (stdDevI == 0.0) {
            for (int j = i + 1; j < m_numericColIndexMap.length; j++) {
                nominatorMatrix.set(m_numericColIndexMap[i], m_numericColIndexMap[j], Double.NaN);
            }
            m_numericsWithConstantValues.add(new Pair<Integer, Integer>(m_numericColIndexMap[i], null));
        } else {
            for (int j = i + 1; j < m_numericColIndexMap.length; j++) {
                nominatorMatrix.set(m_numericColIndexMap[i], m_numericColIndexMap[j], 0.0);
                final double stdDevJ = m_numericStdDevMatrix[j][j];
                if (stdDevJ == 0.0) {
                    nominatorMatrix.set(m_numericColIndexMap[i], m_numericColIndexMap[j], Double.NaN);
                    // rest is fixed when j becomes the current value
                    // in the outer loop
                } else {
                    double stdDevIUnderJ = m_numericStdDevMatrix[i][j];
                    double stdDevJUnderI = m_numericStdDevMatrix[j][i];
                    if (stdDevIUnderJ == 0.0) {
                        // all values in column i where j is not missing
                        // are constant
                        m_numericsWithConstantValues.add(new Pair<Integer, Integer>(m_numericColIndexMap[i], m_numericColIndexMap[j]));
                        nominatorMatrix.set(m_numericColIndexMap[i], m_numericColIndexMap[j], Double.NaN);
                    }
                    if (stdDevJUnderI == 0.0) {
                        // all values in column j where i is not missing
                        // are constant
                        m_numericsWithConstantValues.add(new Pair<Integer, Integer>(m_numericColIndexMap[j], m_numericColIndexMap[i]));
                        nominatorMatrix.set(m_numericColIndexMap[i], m_numericColIndexMap[j], Double.NaN);
                    }
                }
            }
        }
    }
    for (DataRow r : table) {
        // buffer the row once; every cell is read several times below
        for (int i = 0; i < cells.length; i++) {
            cells[i] = r.getCell(i);
        }
        // numeric pairs: accumulate the product of the standardized values
        for (int i = 0; i < m_numericColIndexMap.length; i++) {
            final DataCell ci = cells[m_numericColIndexMap[i]];
            if (ci.isMissing()) {
                continue;
            }
            if (m_numericStdDevMatrix[i][i] == 0.0) {
                // constant column, reported above
                continue;
            }
            final double di = ((DoubleValue) ci).getDoubleValue();
            for (int j = i + 1; j < m_numericColIndexMap.length; j++) {
                final DataCell cj = cells[m_numericColIndexMap[j]];
                if (cj.isMissing()) {
                    continue;
                }
                // mean / std dev restricted to the rows where both columns are present
                final double meanI = m_numericMeanMatrix[i][j];
                final double stdDevI = m_numericStdDevMatrix[i][j];
                final double meanJ = m_numericMeanMatrix[j][i];
                final double stdDevJ = m_numericStdDevMatrix[j][i];
                if (stdDevI == 0.0 || stdDevJ == 0.0) {
                    // reported above
                    continue;
                }
                final double vi = (di - meanI) / stdDevI;
                final double dj = ((DoubleValue) cj).getDoubleValue();
                final double vj = (dj - meanJ) / stdDevJ;
                nominatorMatrix.add(m_numericColIndexMap[i], m_numericColIndexMap[j], vi * vj);
            }
        }
        // categorical pairs: count the value combination in the pair's contingency table
        valIndex = 0;
        for (int i = 0; i < m_categoricalColIndexMap.length; i++) {
            for (int j = i + 1; j < m_categoricalColIndexMap.length; j++, valIndex++) {
                LinkedHashMap<DataCell, Integer> possibleValuesI = m_possibleValues[i];
                LinkedHashMap<DataCell, Integer> possibleValuesJ = m_possibleValues[j];
                if (possibleValuesI == null || possibleValuesJ == null) {
                    continue;
                }
                // read from the row buffer filled above instead of asking the row again
                DataCell ci = cells[m_categoricalColIndexMap[i]];
                DataCell cj = cells[m_categoricalColIndexMap[j]];
                Integer indexI = possibleValuesI.get(ci);
                Integer indexJ = possibleValuesJ.get(cj);
                assert indexI != null && indexI >= 0 : String.format("Value unknown in value list of column \"%s-\": %s", table.getDataTableSpec().getColumnSpec(m_categoricalColIndexMap[i]).getName(), ci);
                // report the offending cell of column j (cj), not the one of column i
                assert indexJ != null && indexJ >= 0 : String.format("Value unknown in value list of column \"%s-\": %s", table.getDataTableSpec().getColumnSpec(m_categoricalColIndexMap[j]).getName(), cj);
                contingencyTables[valIndex][indexI][indexJ]++;
            }
        }
        exec.checkCanceled();
        exec.setProgress(rowIndex / (double) rowCount, String.format("Calculating statistics - %d/%d (\"%s\")", rowIndex, rowCount, r.getKey()));
        rowIndex += 1;
    }
    // normalize the accumulated products by (number of valid rows - 1)
    for (int i = 0; i < m_numericColIndexMap.length; i++) {
        for (int j = i + 1; j < m_numericColIndexMap.length; j++) {
            final int trueI = m_numericColIndexMap[i];
            final int trueJ = m_numericColIndexMap[j];
            double t = nominatorMatrix.get(trueI, trueJ);
            if (!Double.isNaN(t)) {
                int validCount = m_numericValidCountMatrix.get(i, j);
                nominatorMatrix.set(trueI, trueJ, t / (validCount - 1));
            }
        }
    }
    // turn each contingency table into a single association value
    valIndex = 0;
    for (int i = 0; i < m_categoricalColIndexMap.length; i++) {
        for (int j = i + 1; j < m_categoricalColIndexMap.length; j++) {
            int[][] contingencyTable = contingencyTables[valIndex];
            double value;
            if (contingencyTable == null) {
                value = Double.NaN;
            } else {
                value = computeCramersV(contingencyTable);
            }
            nominatorMatrix.set(m_categoricalColIndexMap[i], m_categoricalColIndexMap[j], value);
            valIndex++;
        }
    }
    return nominatorMatrix;
}
use of org.knime.core.data.DoubleValue in project knime-core by knime.
the class Numeric2BitVectorMeanCellFactory method getCell.
/**
 * {@inheritDoc}
 *
 * <p>Builds a bit vector with one bit per configured column. A bit is set when the row's
 * value in that column is at least {@code m_meanFactor} times the column's entry in
 * {@code m_meanValues}. Missing cells leave the bit cleared and are counted as zeros. If a
 * non-missing cell is not a {@link DoubleValue}, an error is reported and a missing cell is
 * returned.
 */
@Override
public DataCell getCell(final DataRow row) {
    incrementNrOfRows();
    final int bitCount = m_columns.length;
    final org.knime.core.data.vector.bitvector.BitVectorCellFactory<? extends DataCell> bitFactory =
        m_vectorType.getCellFactory(bitCount);
    for (int bit = 0; bit < m_columns.length; bit++) {
        final DataCell current = row.getCell(m_columns[bit]);
        if (current.isMissing()) {
            // a missing value never sets the bit
            m_totalNrOf0s++;
            continue;
        }
        if (!(current instanceof DoubleValue)) {
            printError(LOGGER, row, "Incompatible type found.");
            return DataType.getMissingCell();
        }
        final double value = ((DoubleValue) current).getDoubleValue();
        final boolean reachesThreshold = value >= (m_meanFactor * m_meanValues[bit]);
        if (reachesThreshold) {
            bitFactory.set(bit);
            m_totalNrOf1s++;
        } else {
            m_totalNrOf0s++;
        }
    }
    return bitFactory.createDataCell();
}
use of org.knime.core.data.DoubleValue in project knime-core by knime.
the class MedianTable method medianValues.
/**
 * Computes the medians on the first call and returns a copy of the cached result on every
 * call.
 *
 * <p>The first call scans the table once to count, per column, the cells that take part in
 * the median: missing cells only if {@code m_includeMissingValues} is set, NaN values only
 * if {@code m_includeNaNs} is set. From these counts the lower and upper rank of the median
 * are derived and handed to {@code sortOnDisk}.
 *
 * @param context An {@link ExecutionContext}
 * @return The median values for the columns in the order of the columns specified in the constructor. The values
 *         can be {@link Double#NaN}s in certain circumstances.
 * @throws CanceledExecutionException When cancelled.
 * @throws IllegalStateException if a non-missing cell in one of the columns is not a {@link DoubleValue}
 */
public synchronized double[] medianValues(final ExecutionContext context) throws CanceledExecutionException {
    if (m_medians == null) {
        m_medians = new double[m_indices.length];
        boolean completed = false;
        try {
            // long counters: an int would overflow on tables with more than Integer.MAX_VALUE rows
            final long[] validCount = new long[m_indices.length];
            for (DataRow row : m_table) {
                context.checkCanceled();
                for (int i = 0; i < m_indices.length; ++i) {
                    final int col = m_indices[i];
                    final DataCell cell = row.getCell(col);
                    if (cell.isMissing()) {
                        if (m_includeMissingValues) {
                            validCount[i]++;
                        }
                    } else if (cell instanceof DoubleValue) {
                        final DoubleValue dv = (DoubleValue) cell;
                        if (m_includeNaNs || !Double.isNaN(dv.getDoubleValue())) {
                            validCount[i]++;
                        }
                    } else {
                        throw new IllegalStateException("Not a double value: " + cell + " in column: " + m_table.getSpec().getColumnSpec(col).getName());
                    }
                }
            }
            // two indices per column that denote the lower and upper index of the median value (or both the same)
            final long[][] k = new long[2][m_indices.length];
            for (int i = 0; i < 2; i++) {
                for (int j = 0; j < m_indices.length; j++) {
                    k[i][j] = validCount[j] > 0 ? (validCount[j] - 1 + i) / 2 : 0;
                }
            }
            // NOTE(review): sortOnDisk is expected to fill m_medians - confirm against its implementation
            sortOnDisk(context, k);
            completed = true;
        } finally {
            if (!completed) {
                // do not keep a half-computed result cached after a cancellation or failure;
                // the next call starts over
                m_medians = null;
            }
        }
    }
    return m_medians.clone();
}
use of org.knime.core.data.DoubleValue in project knime-core by knime.
the class DoubleMinMax method consumeRow.
/**
 * {@inheritDoc}
 *
 * <p>Updates the running minimum and maximum of every column returned by
 * {@code getIndices()}. Missing cells are skipped. A value replaces the current minimum
 * (maximum) if that one is still NaN or the value is smaller (larger). Infinite values are
 * never stored while {@code m_ignoreInfiniteValues} is set.
 */
@Override
protected void consumeRow(final DataRow dataRow) {
    int slot = -1;
    for (final int columnIndex : getIndices()) {
        // slot is the position of the column within m_min / m_max
        slot++;
        final DataCell candidate = dataRow.getCell(columnIndex);
        if (candidate.isMissing()) {
            continue;
        }
        final double value = ((DoubleValue) candidate).getDoubleValue();
        final boolean storable = !(m_ignoreInfiniteValues && Double.isInfinite(value));
        final boolean lowersMin = Double.isNaN(m_min[slot]) || value < m_min[slot];
        if (lowersMin && storable) {
            m_min[slot] = value;
        }
        final boolean raisesMax = Double.isNaN(m_max[slot]) || value > m_max[slot];
        if (raisesMax && storable) {
            m_max[slot] = value;
        }
    }
}
Aggregations