Use of org.knime.base.data.filter.column.FilterColumnTable in project knime-core by knime.
The class PolyRegLearnerNodeModel, method execute.
/**
* {@inheritDoc}
*/
@Override
protected PortObject[] execute(final PortObject[] inData, final ExecutionContext exec) throws Exception {
BufferedDataTable inTable = (BufferedDataTable) inData[0];
DataTableSpec inSpec = inTable.getDataTableSpec();
final int colCount = inSpec.getNumColumns();
String[] selectedCols = computeSelectedColumns(inSpec);
Set<String> hash = new HashSet<String>(Arrays.asList(selectedCols));
m_colSelected = new boolean[colCount];
for (int i = 0; i < colCount; i++) {
m_colSelected[i] = hash.contains(inTable.getDataTableSpec().getColumnSpec(i).getName());
}
final int rowCount = inTable.getRowCount();
String[] temp = new String[m_columnNames.length + 1];
System.arraycopy(m_columnNames, 0, temp, 0, m_columnNames.length);
temp[temp.length - 1] = m_settings.getTargetColumn();
FilterColumnTable filteredTable = new FilterColumnTable(inTable, temp);
final DataArray rowContainer = new DefaultDataArray(filteredTable, 1, m_settings.getMaxRowsForView());
// handle the optional PMML input
PMMLPortObject inPMMLPort = m_pmmlInEnabled ? (PMMLPortObject) inData[1] : null;
PortObjectSpec[] outputSpec = configure((inPMMLPort == null) ? new PortObjectSpec[] { inData[0].getSpec(), null } : new PortObjectSpec[] { inData[0].getSpec(), inPMMLPort.getSpec() });
Learner learner = new Learner((PMMLPortObjectSpec) outputSpec[0], 0d, m_settings.getMissingValueHandling() == MissingValueHandling.fail, m_settings.getDegree());
try {
PolyRegContent polyRegContent = learner.perform(inTable, exec);
m_betas = fillBeta(polyRegContent);
m_meanValues = polyRegContent.getMeans();
ColumnRearranger crea = new ColumnRearranger(inTable.getDataTableSpec());
crea.append(getCellFactory(inTable.getDataTableSpec().findColumnIndex(m_settings.getTargetColumn())));
PortObject[] bdt = new PortObject[] { createPMMLModel(inPMMLPort, inSpec), exec.createColumnRearrangeTable(inTable, crea, exec.createSilentSubExecutionContext(.2)), polyRegContent.createTablePortObject(exec.createSubExecutionContext(0.2)) };
m_squaredError /= rowCount;
if (polyRegContent.getWarningMessage() != null) {
setWarningMessage(polyRegContent.getWarningMessage());
}
double[] stdErrors = PolyRegViewData.mapToArray(polyRegContent.getStandardErrors(), m_columnNames, m_settings.getDegree(), polyRegContent.getInterceptStdErr());
double[] tValues = PolyRegViewData.mapToArray(polyRegContent.getTValues(), m_columnNames, m_settings.getDegree(), polyRegContent.getInterceptTValue());
double[] pValues = PolyRegViewData.mapToArray(polyRegContent.getPValues(), m_columnNames, m_settings.getDegree(), polyRegContent.getInterceptPValue());
m_viewData = new PolyRegViewData(m_meanValues, m_betas, stdErrors, tValues, pValues, m_squaredError, polyRegContent.getAdjustedRSquared(), m_columnNames, m_settings.getDegree(), m_settings.getTargetColumn(), rowContainer);
return bdt;
} catch (ModelSpecificationException e) {
final String origWarning = getWarningMessage();
final String warning = ((origWarning != null && !origWarning.isEmpty()) ? (origWarning + "\n") : "") + e.getMessage(); // always append the exception message, after the previous warning if there is one
setWarningMessage(warning);
final ExecutionContext subExec = exec.createSubExecutionContext(.1);
final BufferedDataContainer empty = subExec.createDataContainer(STATS_SPEC);
int rowIdx = 1;
for (final String column : m_columnNames) {
for (int d = 1; d <= m_settings.getDegree(); ++d) {
empty.addRowToTable(new DefaultRow("Row" + rowIdx++, new StringCell(column), new IntCell(d), new DoubleCell(0.0d), DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell()));
}
}
empty.addRowToTable(new DefaultRow("Row" + rowIdx, new StringCell("Intercept"), new IntCell(0), new DoubleCell(0.0d), DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell()));
double[] nans = new double[m_columnNames.length * m_settings.getDegree() + 1];
Arrays.fill(nans, Double.NaN);
m_betas = new double[nans.length];
// Means only for the linear terms
m_meanValues = new double[nans.length / m_settings.getDegree()];
m_viewData = new PolyRegViewData(m_meanValues, m_betas, nans, nans, nans, m_squaredError, Double.NaN, m_columnNames, m_settings.getDegree(), m_settings.getTargetColumn(), rowContainer);
empty.close();
ColumnRearranger crea = new ColumnRearranger(inTable.getDataTableSpec());
crea.append(getCellFactory(inTable.getDataTableSpec().findColumnIndex(m_settings.getTargetColumn())));
BufferedDataTable rearrangerTable = exec.createColumnRearrangeTable(inTable, crea, exec.createSubProgress(0.6));
PMMLPortObject model = createPMMLModel(inPMMLPort, inTable.getDataTableSpec());
PortObject[] bdt = new PortObject[] { model, rearrangerTable, empty.getTable() };
return bdt;
}
}
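The FilterColumnTable step above projects the predictor columns plus the target (appended as the last entry) out of the input and caches the projection in a DefaultDataArray for the node view. A minimal sketch of just that pattern, using only the constructors already shown in the snippet; the class and method names are illustrative, not KNIME API:

import java.util.Arrays;

import org.knime.base.data.filter.column.FilterColumnTable;
import org.knime.base.node.util.DataArray;
import org.knime.base.node.util.DefaultDataArray;
import org.knime.core.data.DataTable;

final class ViewSampleUtil {
    private ViewSampleUtil() { }

    /** Projects the table onto predictors + target and caches at most maxRows rows. */
    static DataArray sampleForView(final DataTable table, final String[] predictors, final String target, final int maxRows) {
        // append the target column as the last entry, as in the snippet above
        String[] learnCols = Arrays.copyOf(predictors, predictors.length + 1);
        learnCols[learnCols.length - 1] = target;
        DataTable filtered = new FilterColumnTable(table, learnCols);
        // reads rows 1..maxRows (row indices are 1-based here, as in the snippet)
        return new DefaultDataArray(filtered, 1, maxRows);
    }
}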
Use of org.knime.base.data.filter.column.FilterColumnTable in project knime-core by knime.
The class JoinerNodeModel, method execute.
/**
* {@inheritDoc}
*/
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
BufferedDataContainer dc = exec.createDataContainer(JoinedTable.createSpec(inData[0].getDataTableSpec(), inData[1].getDataTableSpec(), m_method, m_suffix));
DataTable leftTable = inData[0];
DataTable rightTable = inData[1];
// if the filter method is selected, remove right-table columns whose names already occur in the left table so they do not appear twice in the output
if (JoinedTable.METHOD_FILTER.equals(m_method)) {
DataTableSpec leftTableSpec = leftTable.getDataTableSpec();
DataTableSpec rightTableSpec = rightTable.getDataTableSpec();
LinkedHashSet<String> leftHash = new LinkedHashSet<String>();
for (DataColumnSpec c : leftTableSpec) {
leftHash.add(c.getName());
}
LinkedHashSet<String> rightHash = new LinkedHashSet<String>();
for (DataColumnSpec c : rightTableSpec) {
rightHash.add(c.getName());
}
rightHash.removeAll(leftHash);
String[] survivors = rightHash.toArray(new String[rightHash.size()]);
if (survivors.length < rightTableSpec.getNumColumns()) {
rightTable = new FilterColumnTable(rightTable, survivors);
}
}
final BitSet rightRows = new BitSet(inData[1].getRowCount());
final LinkedHashMap<RowKey, SoftReference<Helper>> map = new LinkedHashMap<RowKey, SoftReference<Helper>>(1024);
m_leftRows = 0;
m_outputRows = 0;
m_leftIt = null;
m_rightIt = null;
m_firstMapHelper = null;
m_exec = exec;
if (m_ignoreMissingRows) {
m_max = Math.min(inData[0].getRowCount(), inData[1].getRowCount());
} else {
m_max = Math.max(inData[0].getRowCount(), inData[1].getRowCount());
}
while (true) {
if (!readLeftChunk(leftTable, map)) {
if (!m_ignoreMissingRows) {
processRemainingRightRows(dc, leftTable, rightTable, rightRows);
}
break;
}
if ((m_rightIt == null) || (!m_rightIt.hasNext()) || (rightRows.nextClearBit(0) <= m_rightIt.getIndex())) {
m_rightIt = new CounterRowIterator(rightTable.iterator());
}
while (m_rightIt.hasNext() && (map.size() > 0)) {
m_exec.checkCanceled();
DataRow rightRow = m_rightIt.next();
SoftReference<Helper> sr = map.get(rightRow.getKey());
if (sr != null) {
Helper h = sr.get();
if (h == null) {
map.remove(rightRow.getKey());
} else {
h.m_rightRow = rightRow;
h.m_rightIndex = m_rightIt.getIndex();
if (h.m_leftIndex == m_leftRows) {
// m_firstMapHelper = h;
assert h.m_predecessor == null || !map.containsKey(h.m_predecessor.m_leftRow.getKey());
h.m_predecessor = null;
DataRow joinedRow = new JoinedRow(h.m_leftRow, h.m_rightRow);
dc.addRowToTable(joinedRow);
map.remove(rightRow.getKey());
rightRows.set(m_rightIt.getIndex());
m_leftRows++;
m_outputRows++;
printProgress(rightRow.getKey());
}
}
}
}
processRemainingLeftRowsInMap(dc, rightTable, map, rightRows);
if (!m_ignoreMissingRows) {
if (rightRows.cardinality() == inData[1].getRowCount()) {
processRemainingLeftRowsInTable(dc, leftTable, rightTable);
}
} else {
m_leftRows += map.size();
map.clear();
if (rightRows.cardinality() == inData[1].getRowCount()) {
break;
}
}
}
m_leftIt = null;
m_rightIt = null;
m_exec = null;
m_firstMapHelper = null;
dc.close();
return new BufferedDataTable[] { dc.getTable() };
}
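Here the FilterColumnTable use is confined to the duplicate-column handling at the top: when the join method is METHOD_FILTER, right-table columns whose names already occur on the left are dropped so that each name appears only once in the joined output. A condensed sketch of just that filtering step, with dropDuplicateColumns as an illustrative name:

import java.util.LinkedHashSet;
import java.util.Set;

import org.knime.base.data.filter.column.FilterColumnTable;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataTable;

final class JoinFilterUtil {
    private JoinFilterUtil() { }

    /** Keeps only right-table columns whose names do not occur in the left table. */
    static DataTable dropDuplicateColumns(final DataTable left, final DataTable right) {
        Set<String> leftNames = new LinkedHashSet<>();
        for (DataColumnSpec c : left.getDataTableSpec()) {
            leftNames.add(c.getName());
        }
        LinkedHashSet<String> survivors = new LinkedHashSet<>();
        for (DataColumnSpec c : right.getDataTableSpec()) {
            survivors.add(c.getName());
        }
        survivors.removeAll(leftNames);
        if (survivors.size() == right.getDataTableSpec().getNumColumns()) {
            return right; // no duplicates, nothing to filter
        }
        return new FilterColumnTable(right, survivors.toArray(new String[0]));
    }
}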
Use of org.knime.base.data.filter.column.FilterColumnTable in project knime-core by knime.
The class DefaultVisualizationNodeModel, method execute.
/**
* Converts the input data at inport 0 into a
* {@link org.knime.base.node.util.DataArray} with maximum number of rows as
* defined in the {@link DefaultVisualizationNodeDialog}. Nominal columns
* are ignored if their set of possible values is null or contains more than 60 values.
*
* {@inheritDoc}
*/
@Override
protected BufferedDataTable[] execute(final BufferedDataTable[] inData, final ExecutionContext exec) throws Exception {
// generate list of excluded columns, suppressing warning
findCompatibleColumns(inData[0].getDataTableSpec(), false);
DataTable filter = new FilterColumnTable(inData[0], false, getExcludedColumns());
m_input = new DefaultDataArray(filter, 1, m_maxRows.getIntValue(), exec);
if (m_maxRows.getIntValue() < inData[0].size()) {
setWarningMessage("Only the first " + m_maxRows.getIntValue() + " rows are displayed.");
}
return new BufferedDataTable[0];
}
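Unlike the other examples, this one uses the exclusion form of the constructor: as the call above suggests, passing false makes FilterColumnTable keep every column except the named ones. A minimal sketch of that variant, assuming the excluded names have already been computed:

import org.knime.base.data.filter.column.FilterColumnTable;
import org.knime.core.data.DataTable;

final class ExcludeColumnsUtil {
    private ExcludeColumnsUtil() { }

    /** Returns a view of the table without the given columns. */
    static DataTable withoutColumns(final DataTable table, final String... excluded) {
        // 'false' selects exclusion semantics: all columns except 'excluded' survive
        return new FilterColumnTable(table, false, excluded);
    }
}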
Use of org.knime.base.data.filter.column.FilterColumnTable in project knime-core by knime.
The class PolyRegLearnerNodeModel, method execute.
/**
* {@inheritDoc}
*/
@Override
protected PortObject[] execute(final PortObject[] inData, final ExecutionContext exec) throws Exception {
BufferedDataTable inTable = (BufferedDataTable) inData[0];
DataTableSpec inSpec = inTable.getDataTableSpec();
final int colCount = inSpec.getNumColumns();
String[] selectedCols = computeSelectedColumns(inSpec);
Set<String> hash = new HashSet<String>(Arrays.asList(selectedCols));
m_colSelected = new boolean[colCount];
for (int i = 0; i < colCount; i++) {
m_colSelected[i] = hash.contains(inTable.getDataTableSpec().getColumnSpec(i).getName());
}
final int rowCount = inTable.getRowCount();
final int independentVariables = selectedCols.length;
final int degree = m_settings.getDegree();
final int dependentIndex = inTable.getDataTableSpec().findColumnIndex(m_settings.getTargetColumn());
double[][] xMat = new double[rowCount][1 + independentVariables * degree];
double[][] yMat = new double[rowCount][1];
int rowIndex = 0;
for (DataRow row : inTable) {
exec.checkCanceled();
exec.setProgress(0.2 * rowIndex / rowCount);
xMat[rowIndex][0] = 1;
int colIndex = 1;
for (int i = 0; i < row.getNumCells(); i++) {
if ((m_colSelected[i] || (i == dependentIndex)) && row.getCell(i).isMissing()) {
throw new IllegalArgumentException("Missing values are not supported by this node.");
}
if (m_colSelected[i]) {
double val = ((DoubleValue) row.getCell(i)).getDoubleValue();
double poly = val;
xMat[rowIndex][colIndex] = poly;
colIndex++;
for (int d = 2; d <= degree; d++) {
poly *= val;
xMat[rowIndex][colIndex] = poly;
colIndex++;
}
} else if (i == dependentIndex) {
double val = ((DoubleValue) row.getCell(i)).getDoubleValue();
yMat[rowIndex][0] = val;
}
}
rowIndex++;
}
// compute X'
double[][] xTransMat = MathUtils.transpose(xMat);
exec.setProgress(0.24);
exec.checkCanceled();
// compute X'X
double[][] xxMat = MathUtils.multiply(xTransMat, xMat);
exec.setProgress(0.28);
exec.checkCanceled();
// compute X'Y
double[][] xyMat = MathUtils.multiply(xTransMat, yMat);
exec.setProgress(0.32);
exec.checkCanceled();
// compute (X'X)^-1
double[][] xxInverse;
try {
xxInverse = MathUtils.inverse(xxMat);
exec.setProgress(0.36);
exec.checkCanceled();
} catch (ArithmeticException ex) {
throw new ArithmeticException("The attributes of the data samples are not mutually independent.");
}
// compute (X'X)^-1 * (X'Y)
final double[][] betas = MathUtils.multiply(xxInverse, xyMat);
exec.setProgress(0.4);
m_betas = new double[independentVariables * degree + 1];
for (int i = 0; i < betas.length; i++) {
m_betas[i] = betas[i][0];
}
m_columnNames = selectedCols;
String[] temp = new String[m_columnNames.length + 1];
System.arraycopy(m_columnNames, 0, temp, 0, m_columnNames.length);
temp[temp.length - 1] = m_settings.getTargetColumn();
FilterColumnTable filteredTable = new FilterColumnTable(inTable, temp);
DataArray rowContainer = new DefaultDataArray(filteredTable, 1, m_settings.getMaxRowsForView());
int ignore = rowContainer.getDataTableSpec().findColumnIndex(m_settings.getTargetColumn());
m_meanValues = new double[independentVariables];
for (DataRow row : rowContainer) {
int k = 0;
for (int i = 0; i < row.getNumCells(); i++) {
if (i != ignore) {
m_meanValues[k++] += ((DoubleValue) row.getCell(i)).getDoubleValue();
}
}
}
for (int i = 0; i < m_meanValues.length; i++) {
m_meanValues[i] /= rowContainer.size();
}
ColumnRearranger crea = new ColumnRearranger(inTable.getDataTableSpec());
crea.append(getCellFactory(inTable.getDataTableSpec().findColumnIndex(m_settings.getTargetColumn())));
// handle the optional PMML input
PMMLPortObject inPMMLPort = (PMMLPortObject) inData[1];
PortObject[] bdt = new PortObject[] { exec.createColumnRearrangeTable(inTable, crea, exec.createSubProgress(0.6)), createPMMLModel(inPMMLPort, inTable.getDataTableSpec()) };
m_squaredError /= rowCount;
m_viewData = new PolyRegViewData(m_meanValues, m_betas, m_squaredError, m_columnNames, m_settings.getDegree(), m_settings.getTargetColumn());
m_rowContainer = rowContainer;
return bdt;
}
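The matrix passage in the middle is ordinary least squares on the polynomial design matrix: each selected column x contributes the features x, x^2, ..., x^degree next to a shared intercept column of ones, and the coefficients solve the normal equations beta = (X'X)^-1 * X'y. A compact sketch of that solve using the same MathUtils calls as above (the import path is an assumption; solveNormalEquations is an illustrative name, and for an ill-conditioned X'X a QR- or SVD-based solver would be numerically safer):

import org.knime.base.util.math.MathUtils;

final class LeastSquaresUtil {
    private LeastSquaresUtil() { }

    /** Solves beta = (X'X)^-1 * (X'y) for a design matrix x and column vector y. */
    static double[] solveNormalEquations(final double[][] x, final double[][] y) {
        double[][] xTrans = MathUtils.transpose(x);
        double[][] xx = MathUtils.multiply(xTrans, x); // X'X
        double[][] xy = MathUtils.multiply(xTrans, y); // X'y
        double[][] betas = MathUtils.multiply(MathUtils.inverse(xx), xy);
        double[] result = new double[betas.length];
        for (int i = 0; i < betas.length; i++) {
            result[i] = betas[i][0]; // betas is n x 1 because y is a column vector
        }
        return result;
    }
}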
Use of org.knime.base.data.filter.column.FilterColumnTable in project knime-core by knime.
The class PMCCNodeModel, method execute.
/**
* {@inheritDoc}
*/
@Override
protected PortObject[] execute(final PortObject[] inData, final ExecutionContext exec) throws Exception {
final BufferedDataTable in = (BufferedDataTable) inData[0];
// row count as a double so that subsequent divisions are floating-point operations
final double rC = in.getRowCount();
int[] includes = getIncludes(in.getDataTableSpec());
String[] includeNames = m_columnIncludesList.getIncludeList().toArray(new String[0]);
double progNormalize = 0.3;
double progDetermine = 0.65;
double progFinish = 1.0 - progNormalize - progDetermine;
exec.setMessage("Normalizing data");
final ExecutionMonitor normProg = exec.createSubProgress(progNormalize);
FilterColumnTable filterTable = new FilterColumnTable(in, includes);
final int l = includes.length;
int nomCount = (l - 1) * l / 2;
final HalfDoubleMatrix nominatorMatrix = new HalfDoubleMatrix(includes.length, /*withDiagonal*/ false);
nominatorMatrix.fill(Double.NaN);
@SuppressWarnings("unchecked") final LinkedHashMap<DataCell, Integer>[] possibleValues = new LinkedHashMap[l];
DataTableSpec filterTableSpec = filterTable.getDataTableSpec();
for (int i = 0; i < l; i++) {
DataColumnSpec cs = filterTableSpec.getColumnSpec(i);
if (cs.getType().isCompatible(NominalValue.class)) {
possibleValues[i] = new LinkedHashMap<DataCell, Integer>();
}
}
final int possValueUpperBound = m_maxPossValueCountModel.getIntValue();
// determines possible values. We can't use those from the domain
// as the domain can also contain values not present in the data
// but in the contingency table we need rows/columns to have at least
// one cell with a value >= 1
StatisticsTable statTable = new StatisticsTable(filterTable) {
// that is sort of the constructor in this derived class
{
calculateAllMoments(in.getRowCount(), normProg);
}
@Override
protected void calculateMomentInSubClass(final DataRow row) {
for (int i = 0; i < l; i++) {
if (possibleValues[i] != null) {
DataCell c = row.getCell(i);
// note: also take missing value as possible value
possibleValues[i].put(c, null);
if (possibleValues[i].size() > possValueUpperBound) {
possibleValues[i] = null;
}
}
}
}
};
for (LinkedHashMap<DataCell, Integer> map : possibleValues) {
if (map != null) {
int index = 0;
for (Map.Entry<DataCell, Integer> entry : map.entrySet()) {
entry.setValue(index++);
}
}
}
// stores all pair-wise contingency tables,
// contingencyTables[i] == null <--> either column of the corresponding
// pair is non-categorical.
// What is a contingency table?
// http://en.wikipedia.org/wiki/Contingency_table
int[][][] contingencyTables = new int[nomCount][][];
// columns that contain only one value - no correlation available
LinkedHashSet<String> constantColumns = new LinkedHashSet<String>();
int valIndex = 0;
for (int i = 0; i < l; i++) {
for (int j = i + 1; j < l; j++) {
if (possibleValues[i] != null && possibleValues[j] != null) {
int iSize = possibleValues[i].size();
int jSize = possibleValues[j].size();
contingencyTables[valIndex] = new int[iSize][jSize];
}
DataColumnSpec colSpecI = filterTableSpec.getColumnSpec(i);
DataColumnSpec colSpecJ = filterTableSpec.getColumnSpec(j);
DataType ti = colSpecI.getType();
DataType tj = colSpecJ.getType();
if (ti.isCompatible(DoubleValue.class) && tj.isCompatible(DoubleValue.class)) {
// one of the two columns contains only one value
if (statTable.getVariance(i) < PMCCPortObjectAndSpec.ROUND_ERROR_OK) {
constantColumns.add(colSpecI.getName());
nominatorMatrix.set(i, j, Double.NaN);
} else if (statTable.getVariance(j) < PMCCPortObjectAndSpec.ROUND_ERROR_OK) {
constantColumns.add(colSpecJ.getName());
nominatorMatrix.set(i, j, Double.NaN);
} else {
nominatorMatrix.set(i, j, 0.0);
}
}
valIndex++;
}
}
// a constant column has no defined correlation to any other column (will be a missing value)
if (!constantColumns.isEmpty()) {
String[] constantColumnNames = constantColumns.toArray(new String[constantColumns.size()]);
NodeLogger.getLogger(getClass()).info("The following numeric columns contain only one distinct value or otherwise have a low standard deviation: " + Arrays.toString(constantColumnNames));
int maxLength = 4;
if (constantColumns.size() > maxLength) {
constantColumnNames = Arrays.copyOf(constantColumnNames, maxLength);
constantColumnNames[maxLength - 1] = "...";
}
setWarningMessage("Some columns contain only one distinct value: " + Arrays.toString(constantColumnNames));
}
DataTable att;
if (statTable.getNrRows() > 0) {
att = new Normalizer(statTable, includeNames).doZScoreNorm(exec.createSubProgress(0.0)); // no iteration needed
} else {
att = statTable;
}
normProg.setProgress(1.0);
exec.setMessage("Calculating correlation measure");
ExecutionMonitor detProg = exec.createSubProgress(progDetermine);
int rowIndex = 0;
double[] buf = new double[l];
DataCell[] catBuf = new DataCell[l];
boolean containsMissing = false;
for (DataRow r : att) {
detProg.checkCanceled();
for (int i = 0; i < l; i++) {
catBuf[i] = null;
buf[i] = Double.NaN;
DataCell c = r.getCell(i);
// missing value is also a possible value here
if (possibleValues[i] != null) {
catBuf[i] = c;
} else if (c.isMissing()) {
containsMissing = true;
} else if (filterTableSpec.getColumnSpec(i).getType().isCompatible(DoubleValue.class)) {
buf[i] = ((DoubleValue) c).getDoubleValue();
}
}
valIndex = 0;
for (int i = 0; i < l; i++) {
for (int j = i + 1; j < l; j++) {
double b1 = buf[i];
double b2 = buf[j];
if (!Double.isNaN(b1) && !Double.isNaN(b2)) {
double old = nominatorMatrix.get(i, j);
nominatorMatrix.set(i, j, old + b1 * b2);
} else if (catBuf[i] != null && catBuf[j] != null) {
int iIndex = possibleValues[i].get(catBuf[i]);
assert iIndex >= 0 : "Value unknown in value list of column " + includeNames[i] + ": " + catBuf[i];
int jIndex = possibleValues[j].get(catBuf[j]);
assert jIndex >= 0 : "Value unknown in value list of column " + includeNames[j] + ": " + catBuf[j];
contingencyTables[valIndex][iIndex][jIndex]++;
}
valIndex++;
}
}
rowIndex++;
detProg.setProgress(rowIndex / rC, "Processing row " + rowIndex + " (\"" + r.getKey() + "\")");
}
if (containsMissing) {
setWarningMessage("Some row(s) contained missing values.");
}
detProg.setProgress(1.0);
double normalizer = 1.0 / (rC - 1.0);
valIndex = 0;
for (int i = 0; i < l; i++) {
for (int j = i + 1; j < l; j++) {
if (contingencyTables[valIndex] != null) {
nominatorMatrix.set(i, j, computeCramersV(contingencyTables[valIndex]));
} else if (!Double.isNaN(nominatorMatrix.get(i, j))) {
double old = nominatorMatrix.get(i, j);
nominatorMatrix.set(i, j, old * normalizer);
}
// else pair of columns is double - string (for instance)
valIndex++;
}
}
normProg.setProgress(progDetermine);
PMCCPortObjectAndSpec pmccModel = new PMCCPortObjectAndSpec(includeNames, nominatorMatrix);
ExecutionContext subExec = exec.createSubExecutionContext(progFinish);
BufferedDataTable out = pmccModel.createCorrelationMatrix(subExec);
m_correlationTable = out;
return new PortObject[] { out, pmccModel };
}
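For nominal/nominal column pairs the snippet hands the contingency table to computeCramersV, whose body is not shown here. For reference, a self-contained sketch of the textbook definition, V = sqrt(chi^2 / (n * (min(rows, cols) - 1))); KNIME's actual implementation may differ in detail:

final class CramersVUtil {
    private CramersVUtil() { }

    /** Cramer's V of a contingency table: sqrt(chi^2 / (n * (min(r, c) - 1))). */
    static double cramersV(final int[][] table) {
        int rows = table.length;
        int cols = table[0].length;
        double n = 0;
        double[] rowSums = new double[rows];
        double[] colSums = new double[cols];
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                rowSums[i] += table[i][j];
                colSums[j] += table[i][j];
                n += table[i][j];
            }
        }
        double chiSquared = 0;
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                double expected = rowSums[i] * colSums[j] / n;
                if (expected > 0) { // skip empty rows/columns to avoid division by zero
                    double diff = table[i][j] - expected;
                    chiSquared += diff * diff / expected;
                }
            }
        }
        return Math.sqrt(chiSquared / (n * (Math.min(rows, cols) - 1)));
    }
}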