use of org.knime.core.data.DataCell in project knime-core by knime.
the class HistogramColumn method createColumnRearranger.
/**
 * Builds a {@link ColumnRearranger} on top of the spec of {@code stats} that appends a single histogram column.
 * <p>
 * The key of each processed row is treated as a column name. Rows whose key is not contained in {@code columns},
 * or for whose column no entry exists in {@code histograms}, receive a missing cell; all other rows receive the
 * cell produced by {@code createImageCell}.
 *
 * @param data The input data table; its spec is used to translate a row key (column name) into the column index
 *            under which the model is looked up in {@code histograms}.
 * @param stats The statistics table whose spec is extended by the histogram column.
 * @param histograms The numeric histogram models, keyed by column index in {@code data}.
 * @param maxBinCount Not read by this method.
 * @param columns The names of the columns to be described; the array itself is not modified.
 * @return The {@link ColumnRearranger} with the histogram column appended.
 */
ColumnRearranger createColumnRearranger(final BufferedDataTable data, final BufferedDataTable stats,
    final Map<Integer, HistogramNumericModel> histograms, final int maxBinCount, final String... columns) {
    final ColumnRearranger result = new ColumnRearranger(stats.getDataTableSpec());
    final DataColumnSpec histogramSpec = createHistogramColumnSpec();
    // Work on a sorted private copy so that membership can be tested by binary search
    // without touching the caller's array.
    final String[] describedColumns = columns.clone();
    Arrays.sort(describedColumns);
    result.append(new SingleCellFactory(true, histogramSpec) {
        @Override
        public DataCell getCell(final DataRow row) {
            final String columnName = row.getKey().getString();
            final boolean described = Arrays.binarySearch(describedColumns, columnName) >= 0;
            if (!described) {
                return DataType.getMissingCell();
            }
            final int columnIndex = data.getSpec().findColumnIndex(columnName);
            final HistogramNumericModel model = histograms.get(Integer.valueOf(columnIndex));
            if (model == null) {
                // No histogram model is registered for this column (the original author notes "wrong bounds").
                return DataType.getMissingCell();
            }
            assert columnIndex == model.getColIndex() : "Expected: " + columnIndex + ", but got: " + model.getColIndex();
            return createImageCell(model, false);
        }
    });
    return result;
}
use of org.knime.core.data.DataCell in project knime-core by knime.
the class HistogramColumn method constructFromDataArray.
/**
 * Builds the row key lookup structures from the numeric histogram models and the rows of {@code data}.
 * <p>
 * The first map of the result is keyed by column index and then by bin index (as reported by the column's
 * histogram model); the second map is keyed by column index and then by the cell value itself. Both lead to the
 * set of keys of the rows that fall into that bin, respectively carry that value.
 * <p>
 * A column takes part in the numeric mapping if its type is compatible with {@link DoubleValue} and a non-null
 * model is registered for its index in {@code histograms}. A column takes part in the nominal mapping if its
 * domain has values or its name is contained in {@code nominalColumnNames}. Missing cells are skipped in the
 * nominal mapping; cells that are not a {@link DoubleValue} are skipped in the numeric mapping.
 *
 * @param histograms The numeric histograms, keyed by column index.
 * @param data The input data.
 * @param nominalColumnNames The nominal column names.
 * @return The helper data structures (numeric mapping first, nominal mapping second).
 * @see #construct(Map, DataTable, Set)
 */
protected static Pair<Map<Integer, Map<Integer, Set<RowKey>>>, Map<Integer, Map<DataValue, Set<RowKey>>>> constructFromDataArray(final Map<Integer, HistogramNumericModel> histograms, final DataTable data, final Set<String> nominalColumnNames) {
    final Map<Integer, Map<Integer, Set<RowKey>>> keysByBin = new HashMap<Integer, Map<Integer, Set<RowKey>>>();
    final Map<Integer, Map<DataValue, Set<RowKey>>> keysByValue = new HashMap<Integer, Map<DataValue, Set<RowKey>>>();

    // Phase 1: decide per column which of the two mappings it participates in.
    final DataTableSpec spec = data.getDataTableSpec();
    for (DataColumnSpec column : spec) {
        final String name = column.getName();
        final int index = spec.findColumnIndex(name);
        final boolean hasModel = column.getType().isCompatible(DoubleValue.class)
            && histograms.get(Integer.valueOf(index)) != null;
        if (hasModel) {
            keysByBin.put(index, new HashMap<Integer, Set<RowKey>>());
        }
        final boolean nominal = column.getDomain().hasValues() || nominalColumnNames.contains(name);
        if (nominal) {
            keysByValue.put(index, new HashMap<DataValue, Set<RowKey>>());
        }
    }

    // Phase 2: distribute the row keys over the bins and values.
    for (DataRow row : data) {
        final RowKey rowKey = row.getKey();
        for (Entry<Integer, Map<Integer, Set<RowKey>>> numericEntry : keysByBin.entrySet()) {
            final Integer columnIndex = numericEntry.getKey();
            final DataCell cell = row.getCell(columnIndex.intValue());
            if (!(cell instanceof DoubleValue)) {
                continue;
            }
            final Integer bin = Integer.valueOf(histograms.get(columnIndex).findBin((DoubleValue) cell));
            final Map<Integer, Set<RowKey>> bins = numericEntry.getValue();
            Set<RowKey> keys = bins.get(bin);
            if (keys == null) {
                keys = new HashSet<RowKey>();
                bins.put(bin, keys);
            }
            keys.add(rowKey);
        }
        for (Entry<Integer, Map<DataValue, Set<RowKey>>> nominalEntry : keysByValue.entrySet()) {
            final DataCell cell = row.getCell(nominalEntry.getKey().intValue());
            if (cell.isMissing()) {
                continue;
            }
            final Map<DataValue, Set<RowKey>> values = nominalEntry.getValue();
            Set<RowKey> keys = values.get(cell);
            if (keys == null) {
                keys = new HashSet<RowKey>();
                values.put(cell, keys);
            }
            keys.add(rowKey);
        }
    }
    return Pair.create(keysByBin, keysByValue);
}
use of org.knime.core.data.DataCell in project knime-core by knime.
the class HistogramColumn method loadHistograms.
/**
 * Restores the histograms from the saved internal files and rebuilds the row key lookups from the stored rows.
 * <p>
 * The numeric histogram models are read through {@code loadHistogramsPrivate}, which is also handed the (initially
 * empty) numeric row key map; only the columns present in that map afterwards are processed as numeric columns
 * here (presumably that helper registers them - verify against its implementation). The rows are read from
 * {@code dataArrayGz}. For numeric columns, non-missing {@link DoubleValue} cells are assigned to the bin reported
 * by the column's model. For nominal columns every cell is used as a key as-is; missing cells are not filtered
 * out here.
 *
 * @param histogramsGz The file for the histograms.
 * @param dataArrayGz The data array file for the row keys.
 * @param nominalColumns The nominal columns; names that are not found in the stored table are ignored.
 * @param strategy The strategy used to compute the bins.
 * @param means The mean values for the numeric columns.
 * @return A triple (Pair(Pair(,),)) of histograms, numeric and nominal row keys.
 * @throws IOException Failed to read the files.
 * @throws InvalidSettingsException Something went wrong.
 */
public static Pair<Pair<Map<Integer, ? extends HistogramModel<?>>, Map<Integer, Map<Integer, Set<RowKey>>>>, Map<Integer, Map<DataValue, Set<RowKey>>>> loadHistograms(final File histogramsGz, final File dataArrayGz, final Set<String> nominalColumns, final BinNumberSelectionStrategy strategy, final double[] means) throws IOException, InvalidSettingsException {
    final Map<Integer, Map<Integer, Set<RowKey>>> keysByBin = new HashMap<Integer, Map<Integer, Set<RowKey>>>();
    final Map<Integer, HistogramNumericModel> models = loadHistogramsPrivate(histogramsGz, keysByBin, strategy, means);
    final Map<Integer, Map<DataValue, Set<RowKey>>> keysByValue = new HashMap<Integer, Map<DataValue, Set<RowKey>>>();
    final ContainerTable storedRows = DataContainer.readFromZip(dataArrayGz);

    // Register every nominal column that still exists in the stored table.
    for (String name : nominalColumns) {
        final int index = storedRows.getDataTableSpec().findColumnIndex(name);
        if (index >= 0) {
            keysByValue.put(Integer.valueOf(index), new HashMap<DataValue, Set<RowKey>>());
        }
    }

    for (DataRow row : storedRows) {
        final RowKey rowKey = row.getKey();
        for (Entry<Integer, Map<Integer, Set<RowKey>>> numericEntry : keysByBin.entrySet()) {
            final Integer column = numericEntry.getKey();
            final HistogramNumericModel model = models.get(column);
            final Map<Integer, Set<RowKey>> bins = numericEntry.getValue();
            final DataCell cell = row.getCell(column.intValue());
            if (cell.isMissing() || !(cell instanceof DoubleValue)) {
                continue;
            }
            final Integer bin = Integer.valueOf(model.findBin((DoubleValue) cell));
            Set<RowKey> keys = bins.get(bin);
            if (keys == null) {
                keys = new HashSet<RowKey>();
                bins.put(bin, keys);
            }
            keys.add(rowKey);
        }
        for (Entry<Integer, Map<DataValue, Set<RowKey>>> nominalEntry : keysByValue.entrySet()) {
            // The cell is used as key unconditionally, i.e. missing cells get their own entry.
            final DataCell cell = row.getCell(nominalEntry.getKey().intValue());
            final Map<DataValue, Set<RowKey>> values = nominalEntry.getValue();
            Set<RowKey> keys = values.get(cell);
            if (keys == null) {
                keys = new HashSet<RowKey>();
                values.put(cell, keys);
            }
            keys.add(rowKey);
        }
    }

    final Pair<Map<Integer, ? extends HistogramModel<?>>, Map<Integer, Map<Integer, Set<RowKey>>>> numericPart =
        new Pair<Map<Integer, ? extends HistogramModel<?>>, Map<Integer, Map<Integer, Set<RowKey>>>>(models, keysByBin);
    return Pair.create(numericPart, keysByValue);
}
use of org.knime.core.data.DataCell in project knime-core by knime.
the class RankCorrelationComputeNodeModel method filterMissings.
/**
 * Copies all rows that contain no missing cell into a new table with the same spec.
 *
 * @param filteredTable The table whose rows are inspected.
 * @param exec The execution context used to create the output container.
 * @return A table with the spec of {@code filteredTable} holding only the rows without any missing value, in
 *         their original order.
 */
private BufferedDataTable filterMissings(final BufferedDataTable filteredTable, final ExecutionContext exec) {
    final BufferedDataContainer complete = exec.createDataContainer(filteredTable.getDataTableSpec());
    rows:
    for (DataRow candidate : filteredTable) {
        for (DataCell cell : candidate) {
            if (cell.isMissing()) {
                // One missing cell disqualifies the whole row; move on to the next one.
                continue rows;
            }
        }
        complete.addRowToTable(candidate);
    }
    complete.close();
    return complete.getTable();
}
use of org.knime.core.data.DataCell in project knime-core by knime.
the class SortedCorrelationComputer method calculateKendall.
/**
 * Calculates a Kendall-type rank correlation for all pairs of data table columns based on the previously
 * calculated ranks in {@code m_rank}.
 * <p>
 * All ordered pairs of rows are visited. For every pair of columns (x, y) each unordered pair of rows is counted
 * exactly once as either concordant, discordant, tied only in x or tied only in y; row pairs tied in both
 * variables (including a row paired with itself) are not counted. From these counts the measure selected by
 * {@code corrType} is derived:
 * <ul>
 * <li>{@code CFG_KENDALLA}: (C - D) / (n * (n - 1) / 2)</li>
 * <li>{@code CFG_KENDALLB}: (C - D) / (sqrt(C + D + Tx) * sqrt(C + D + Ty))</li>
 * <li>{@code CFG_KRUSKALAL}: (C - D) / (C + D)</li>
 * </ul>
 * A zero denominator yields {@code NaN} for that entry. If {@code corrType} matches none of the three constants,
 * no entry of the returned matrix is set.
 * <p>
 * Every cell of {@code m_rank} is cast to {@link DoubleValue}; missing cells must have been removed before.
 *
 * @param corrType the type of correlation used, as defined in RankCorrelationComputeNodeModel
 * @param exec the execution monitor used for progress reporting and cancellation checks
 * @return the output matrix to be turned into the output model
 * @throws CanceledExecutionException if canceled by users
 */
HalfDoubleMatrix calculateKendall(final String corrType, final ExecutionMonitor exec)
    throws CanceledExecutionException {
    // the ranking must have been calculated before
    assert (m_rank != null);
    final int coCount = m_rank.getDataTableSpec().getNumColumns();
    HalfDoubleMatrix nominatorMatrix = new HalfDoubleMatrix(coCount, /*includeDiagonal=*/false);
    double[][] cMatrix = new double[coCount][coCount];
    double[][] dMatrix = new double[coCount][coCount];
    double[][] txMatrix = new double[coCount][coCount];
    double[][] tyMatrix = new double[coCount][coCount];
    // The values of a row are needed many times, so they are extracted once per row.
    final double[] values = new double[coCount];
    final double[] values2 = new double[coCount];
    int rowIndex = 0;
    final int rowCount = m_rank.getRowCount();
    for (DataRow r : m_rank) {
        for (int i = 0; i < coCount; i++) {
            values[i] = ((DoubleValue) r.getCell(i)).getDoubleValue();
        }
        for (DataRow r2 : m_rank) {
            exec.checkCanceled();
            for (int i = 0; i < coCount; i++) {
                values2[i] = ((DoubleValue) r2.getCell(i)).getDoubleValue();
            }
            for (int i = 0; i < coCount; i++) {
                final double x1 = values[i];
                final double x2 = values2[i];
                for (int j = 0; j < coCount; j++) {
                    final double y1 = values[j];
                    final double y2 = values2[j];
                    // Both (r, r2) and (r2, r) are visited, so every branch below accepts only one of the two
                    // orientations of a row pair. The tie branches formerly tested "!=" and therefore counted
                    // each tied pair twice, which inflated the tau-b denominator.
                    if (x1 < x2 && y1 < y2) {
                        // values are concordant
                        cMatrix[i][j]++;
                    } else if (x1 < x2 && y1 > y2) {
                        // values are discordant
                        dMatrix[i][j]++;
                    } else if (x1 < x2 && y1 == y2) {
                        // values are tied in y only
                        tyMatrix[i][j]++;
                    } else if (x1 == x2 && y1 < y2) {
                        // values are tied in x only
                        txMatrix[i][j]++;
                    }
                    // Remaining cases: the mirrored orientation of a pair counted above, or a pair tied in
                    // both x and y. No measure needs the latter count.
                }
            }
        }
        exec.checkCanceled();
        exec.setProgress(0.95 * rowIndex / rowCount, String.format("Calculating - %d/%d (\"%s\")", rowIndex, rowCount, r.getKey()));
        rowIndex++;
    }
    // The pair counting above accounts for the first 95% of the progress; the remaining 5% are reported
    // on top of that so that the progress never moves backwards.
    if (corrType.equals(RankCorrelationComputeNodeModel.CFG_KENDALLA)) {
        double nrOfRows = m_rank.getRowCount();
        // kendalls Tau a
        double divisor = (nrOfRows * (nrOfRows - 1.0)) * 0.5;
        for (int i = 0; i < coCount; i++) {
            for (int j = i + 1; j < coCount; j++) {
                nominatorMatrix.set(i, j, (cMatrix[i][j] - dMatrix[i][j]) / divisor);
            }
            exec.setProgress(0.95 + 0.05 * i / coCount, "Calculating correlations");
        }
    } else if (corrType.equals(RankCorrelationComputeNodeModel.CFG_KENDALLB)) {
        // kendalls Tau b
        for (int i = 0; i < coCount; i++) {
            for (int j = i + 1; j < coCount; j++) {
                double div = Math.sqrt(cMatrix[i][j] + dMatrix[i][j] + txMatrix[i][j]) * Math.sqrt(cMatrix[i][j] + dMatrix[i][j] + tyMatrix[i][j]);
                nominatorMatrix.set(i, j, (cMatrix[i][j] - dMatrix[i][j]) / div);
            }
            exec.setProgress(0.95 + 0.05 * i / coCount, "Calculating correlations");
        }
    } else if (corrType.equals(RankCorrelationComputeNodeModel.CFG_KRUSKALAL)) {
        // Kruskals Gamma
        for (int i = 0; i < coCount; i++) {
            for (int j = i + 1; j < coCount; j++) {
                nominatorMatrix.set(i, j, (cMatrix[i][j] - dMatrix[i][j]) / (cMatrix[i][j] + dMatrix[i][j]));
            }
            exec.setProgress(0.95 + 0.05 * i / coCount, "Calculating correlations");
        }
    }
    return nominatorMatrix;
}
Aggregations