use of org.knime.core.data.DataCell in project knime-core by knime.
the class RuleEngineNodeModel method createRearranger.
private ColumnRearranger createRearranger(final DataTableSpec inSpec, final List<Rule> rules) throws InvalidSettingsException {
ColumnRearranger crea = new ColumnRearranger(inSpec);
String newColName = DataTableSpec.getUniqueColumnName(inSpec, m_settings.getNewColName());
final int defaultLabelColumnIndex;
if (m_settings.getDefaultLabelIsColumn()) {
if (m_settings.getDefaultLabel().length() < 3) {
throw new InvalidSettingsException("Default label is not a column reference");
}
if (!m_settings.getDefaultLabel().startsWith("$") || !m_settings.getDefaultLabel().endsWith("$")) {
throw new InvalidSettingsException("Column references in default label must be enclosed in $");
}
String colRef = m_settings.getDefaultLabel().substring(1, m_settings.getDefaultLabel().length() - 1);
defaultLabelColumnIndex = inSpec.findColumnIndex(colRef);
if (defaultLabelColumnIndex == -1) {
throw new InvalidSettingsException("Column '" + m_settings.getDefaultLabel() + "' for default label does not exist in input table");
}
} else {
defaultLabelColumnIndex = -1;
}
// determine output type
List<DataType> types = new ArrayList<DataType>();
// add outcome column types
for (Rule r : rules) {
if (r.getOutcome() instanceof ColumnReference) {
types.add(((ColumnReference) r.getOutcome()).spec.getType());
} else if (r.getOutcome() instanceof Double) {
types.add(DoubleCell.TYPE);
} else if (r.getOutcome() instanceof Integer) {
types.add(IntCell.TYPE);
} else if (r.getOutcome().toString().length() > 0) {
types.add(StringCell.TYPE);
}
}
if (defaultLabelColumnIndex >= 0) {
types.add(inSpec.getColumnSpec(defaultLabelColumnIndex).getType());
} else if (m_settings.getDefaultLabel().length() > 0) {
try {
Integer.parseInt(m_settings.getDefaultLabel());
types.add(IntCell.TYPE);
} catch (NumberFormatException ex) {
try {
Double.parseDouble(m_settings.getDefaultLabel());
types.add(DoubleCell.TYPE);
} catch (NumberFormatException ex1) {
types.add(StringCell.TYPE);
}
}
}
final DataType outType;
if (types.size() > 0) {
DataType temp = types.get(0);
for (int i = 1; i < types.size(); i++) {
temp = DataType.getCommonSuperType(temp, types.get(i));
}
if ((temp.getValueClasses().size() == 1) && temp.getValueClasses().contains(DataValue.class)) {
// a non-native type, we replace it with string
temp = StringCell.TYPE;
}
outType = temp;
} else {
outType = StringCell.TYPE;
}
DataColumnSpec cs = new DataColumnSpecCreator(newColName, outType).createSpec();
crea.append(new SingleCellFactory(cs) {
@Override
public DataCell getCell(final DataRow row) {
for (Rule r : rules) {
if (r.matches(row)) {
Object outcome = r.getOutcome();
if (outcome instanceof ColumnReference) {
DataCell cell = row.getCell(((ColumnReference) outcome).index);
if (outType.equals(StringCell.TYPE) && !cell.isMissing() && !cell.getType().equals(StringCell.TYPE)) {
return new StringCell(cell.toString());
} else {
return cell;
}
} else if (outType.equals(IntCell.TYPE)) {
return new IntCell((Integer) outcome);
} else if (outType.equals(DoubleCell.TYPE)) {
return new DoubleCell((Double) outcome);
} else {
return new StringCell(outcome.toString());
}
}
}
if (defaultLabelColumnIndex >= 0) {
DataCell cell = row.getCell(defaultLabelColumnIndex);
if (outType.equals(StringCell.TYPE) && !cell.getType().equals(StringCell.TYPE)) {
return new StringCell(cell.toString());
} else {
return cell;
}
} else if (m_settings.getDefaultLabel().length() > 0) {
String l = m_settings.getDefaultLabel();
if (outType.equals(StringCell.TYPE)) {
return new StringCell(l);
}
try {
int i = Integer.parseInt(l);
return new IntCell(i);
} catch (NumberFormatException ex) {
try {
double d = Double.parseDouble(l);
return new DoubleCell(d);
} catch (NumberFormatException ex1) {
return new StringCell(l);
}
}
} else {
return DataType.getMissingCell();
}
}
});
return crea;
}
use of org.knime.core.data.DataCell in project knime-core by knime.
the class LogRegLearnerNodeDialogPane method createTargetOptionsPanel.
/**
* Create options panel for the target.
*/
private JPanel createTargetOptionsPanel() {
JPanel p = new JPanel(new GridBagLayout());
GridBagConstraints c = new GridBagConstraints();
c.fill = GridBagConstraints.HORIZONTAL;
c.weightx = 0;
c.weighty = 0;
c.gridx = 0;
c.gridy = 0;
c.anchor = GridBagConstraints.BASELINE_LEADING;
c.insets = new Insets(5, 5, 0, 0);
p.add(new JLabel("Target Column:"), c);
c.gridx++;
m_selectionPanel = new ColumnSelectionPanel(new EmptyBorder(0, 0, 0, 0), NominalValue.class);
m_selectionPanel.addActionListener(new ActionListener() {
@Override
public void actionPerformed(final ActionEvent e) {
updateTargetCategories((DataCell) m_targetReferenceCategory.getSelectedItem());
}
});
p.add(m_selectionPanel, c);
c.gridx = 0;
c.gridy++;
p.add(new JLabel("Reference Category:"), c);
c.gridx++;
m_targetReferenceCategory = new JComboBox();
p.add(m_targetReferenceCategory, c);
c.gridx = 0;
c.gridy++;
c.gridwidth = 3;
c.weightx = 1;
m_notSortTarget = new JCheckBox("Use order from target column domain (only relevant for output representation)");
p.add(m_notSortTarget, c);
m_selectionPanel.addItemListener(new ItemListener() {
@Override
public void itemStateChanged(final ItemEvent e) {
Object selected = e.getItem();
if (selected instanceof DataColumnSpec) {
m_filterPanel.resetHiding();
m_filterPanel.hideColumns((DataColumnSpec) selected);
}
}
});
return p;
}
use of org.knime.core.data.DataCell in project knime-core by knime.
the class LogRegLearner method checkConstantLearningFields.
private void checkConstantLearningFields(final BufferedDataTable data, final PMMLPortObjectSpec inPMMLSpec) throws InvalidSettingsException {
Set<String> exclude = new HashSet<String>();
for (DataColumnSpec colSpec : m_pmmlOutSpec.getLearningCols()) {
if (colSpec.getType().isCompatible(DoubleValue.class)) {
final DataColumnDomain domain = colSpec.getDomain();
final DataCell lowerBound = domain.getLowerBound();
final DataCell upperBound = domain.getUpperBound();
assert lowerBound != null || data.size() == 0 : "Non empty table must have domain set at this point";
if (ObjectUtils.equals(lowerBound, upperBound)) {
exclude.add(colSpec.getName());
}
}
}
if (!exclude.isEmpty()) {
StringBuilder warning = new StringBuilder();
warning.append(exclude.size() == 1 ? "Column " : "Columns ");
warning.append(ConvenienceMethods.getShortStringFrom(exclude, 5));
warning.append(exclude.size() == 1 ? " has a constant value " : " have constant values ");
warning.append(" - will be ignored during training");
LOGGER.warn(warning.toString());
m_warningMessage = (m_warningMessage == null ? "" : m_warningMessage + "\n") + warning.toString();
// re-init learner so that it has the correct learning columns
init(data.getDataTableSpec(), inPMMLSpec, exclude);
}
}
use of org.knime.core.data.DataCell in project knime-core by knime.
the class LogRegLearner method init.
/**
* Initialize instance and check if settings are consistent.
*/
private void init(final DataTableSpec inSpec, final PMMLPortObjectSpec pmmlSpec, final Set<String> exclude) throws InvalidSettingsException {
List<String> inputCols = new ArrayList<String>();
for (DataColumnSpec column : inSpec) {
inputCols.add(column.getName());
}
if (!m_settings.getIncludeAll()) {
List<String> included = Arrays.asList(m_settings.getIncludedColumns());
if (!inputCols.containsAll(included)) {
LOGGER.warn("Input does not contain all learning columns. " + "Proceed with the remaining learning columns.");
}
inputCols.retainAll(included);
}
inputCols.remove(m_settings.getTargetColumn());
if (inputCols.isEmpty()) {
throw new InvalidSettingsException("At least one column must " + "be included.");
}
DataColumnSpec targetColSpec = null;
List<DataColumnSpec> regressorColSpecs = new ArrayList<DataColumnSpec>();
// Auto configuration when target is not set
if (null == m_settings.getTargetColumn() && m_settings.getIncludeAll()) {
for (int i = 0; i < inSpec.getNumColumns(); i++) {
DataColumnSpec colSpec = inSpec.getColumnSpec(i);
String colName = colSpec.getName();
inputCols.remove(colName);
if (colSpec.getType().isCompatible(NominalValue.class)) {
m_settings.setTargetColumn(colName);
}
}
// when there is no column with nominal data
if (null == m_settings.getTargetColumn()) {
throw new InvalidSettingsException("No column in " + "spec compatible to \"NominalValue\".");
}
}
// remove all columns that should not be used
inputCols.removeAll(exclude);
for (int i = 0; i < inSpec.getNumColumns(); i++) {
DataColumnSpec colSpec = inSpec.getColumnSpec(i);
String colName = colSpec.getName();
if (m_settings.getTargetColumn().equals(colName)) {
if (colSpec.getType().isCompatible(NominalValue.class)) {
targetColSpec = colSpec;
} else {
throw new InvalidSettingsException("Type of column \"" + colName + "\" is not nominal.");
}
} else if (inputCols.contains(colName)) {
if (colSpec.getType().isCompatible(DoubleValue.class) || colSpec.getType().isCompatible(NominalValue.class)) {
regressorColSpecs.add(colSpec);
} else {
throw new InvalidSettingsException("Type of column \"" + colName + "\" is not one of the allowed types, " + "which are numeric or nomial.");
}
}
}
if (null != targetColSpec) {
// Check if target has at least two categories.
final Set<DataCell> targetValues = targetColSpec.getDomain().getValues();
if (targetValues != null && targetValues.size() < 2) {
throw new InvalidSettingsException("The target column \"" + targetColSpec.getName() + "\" has one value, only. " + "At least two target categories are expected.");
}
String[] learnerCols = new String[regressorColSpecs.size() + 1];
for (int i = 0; i < regressorColSpecs.size(); i++) {
learnerCols[i] = regressorColSpecs.get(i).getName();
}
learnerCols[learnerCols.length - 1] = targetColSpec.getName();
PMMLPortObjectSpecCreator creator = new PMMLPortObjectSpecCreator(pmmlSpec, inSpec);
creator.setTargetCols(Arrays.asList(targetColSpec));
creator.setLearningCols(regressorColSpecs);
m_pmmlOutSpec = creator.createSpec();
m_learner = new Learner(m_pmmlOutSpec, m_settings.getTargetReferenceCategory(), m_settings.getSortTargetCategories(), m_settings.getSortIncludesCategories());
} else {
throw new InvalidSettingsException("The target is " + "not in the input.");
}
}
use of org.knime.core.data.DataCell in project knime-core by knime.
the class Learner method perform.
/**
* @param data The data table.
* @param exec The execution context used for reporting progress.
* @return An object which holds the results.
* @throws CanceledExecutionException when method is cancelled
* @throws InvalidSettingsException When settings are inconsistent with the data
*/
public LogisticRegressionContent perform(final BufferedDataTable data, final ExecutionContext exec) throws CanceledExecutionException, InvalidSettingsException {
exec.checkCanceled();
int iter = 0;
boolean converged = false;
final RegressionTrainingData trainingData = new RegressionTrainingData(data, m_outSpec, m_specialColumns, true, m_targetReferenceCategory, m_sortTargetCategories, m_sortFactorsCategories);
int targetIndex = data.getDataTableSpec().findColumnIndex(m_outSpec.getTargetCols().get(0).getName());
final int tcC = trainingData.getDomainValues().get(targetIndex).size();
final int rC = trainingData.getRegressorCount();
final RealMatrix beta = new Array2DRowRealMatrix(1, (tcC - 1) * (rC + 1));
Double loglike = 0.0;
Double loglikeOld = 0.0;
exec.setMessage("Iterative optimization. Processing iteration 1.");
// main loop
while (iter < m_maxIter && !converged) {
RealMatrix betaOld = beta.copy();
loglikeOld = loglike;
// Do heavy work in a separate thread which allows to interrupt it
// note the queue may block if no more threads are available (e.g. thread count = 1)
// as soon as we stall in 'get' this thread reduces the number of running thread
Future<Double> future = ThreadPool.currentPool().enqueue(new Callable<Double>() {
@Override
public Double call() throws Exception {
final ExecutionMonitor progMon = exec.createSubProgress(1.0 / m_maxIter);
irlsRls(trainingData, beta, rC, tcC, progMon);
progMon.setProgress(1.0);
return likelihood(trainingData.iterator(), beta, rC, tcC, exec);
}
});
try {
loglike = future.get();
} catch (InterruptedException e) {
future.cancel(true);
exec.checkCanceled();
throw new RuntimeException(e);
} catch (ExecutionException e) {
if (e.getCause() instanceof RuntimeException) {
throw (RuntimeException) e.getCause();
} else {
throw new RuntimeException(e.getCause());
}
}
if (Double.isInfinite(loglike) || Double.isNaN(loglike)) {
throw new RuntimeException(FAILING_MSG);
}
exec.checkCanceled();
// test for decreasing likelihood
while ((Double.isInfinite(loglike) || Double.isNaN(loglike) || loglike < loglikeOld) && iter > 0) {
converged = true;
for (int k = 0; k < beta.getRowDimension(); k++) {
if (abs(beta.getEntry(k, 0) - betaOld.getEntry(k, 0)) > m_eps * abs(betaOld.getEntry(k, 0))) {
converged = false;
break;
}
}
if (converged) {
break;
}
// half the step size of beta
beta.setSubMatrix((beta.add(betaOld)).scalarMultiply(0.5).getData(), 0, 0);
exec.checkCanceled();
loglike = likelihood(trainingData.iterator(), beta, rC, tcC, exec);
exec.checkCanceled();
}
// test for convergence
converged = true;
for (int k = 0; k < beta.getRowDimension(); k++) {
if (abs(beta.getEntry(k, 0) - betaOld.getEntry(k, 0)) > m_eps * abs(betaOld.getEntry(k, 0))) {
converged = false;
break;
}
}
iter++;
LOGGER.debug("#Iterations: " + iter);
LOGGER.debug("Log Likelihood: " + loglike);
StringBuilder betaBuilder = new StringBuilder();
for (int i = 0; i < beta.getRowDimension() - 1; i++) {
betaBuilder.append(Double.toString(beta.getEntry(i, 0)));
betaBuilder.append(", ");
}
if (beta.getRowDimension() > 0) {
betaBuilder.append(Double.toString(beta.getEntry(beta.getRowDimension() - 1, 0)));
}
LOGGER.debug("beta: " + betaBuilder.toString());
exec.checkCanceled();
exec.setMessage("Iterative optimization. #Iterations: " + iter + " | Log-likelihood: " + DoubleFormat.formatDouble(loglike) + ". Processing iteration " + (iter + 1) + ".");
}
// The covariance matrix
RealMatrix covMat = new QRDecomposition(A).getSolver().getInverse().scalarMultiply(-1);
List<String> factorList = new ArrayList<String>();
List<String> covariateList = new ArrayList<String>();
Map<String, List<DataCell>> factorDomainValues = new HashMap<String, List<DataCell>>();
for (int i : trainingData.getActiveCols()) {
DataColumnSpec columnSpec = data.getDataTableSpec().getColumnSpec(i);
if (trainingData.getIsNominal().get(i)) {
String factor = columnSpec.getName();
factorList.add(factor);
List<DataCell> values = trainingData.getDomainValues().get(i);
factorDomainValues.put(factor, values);
} else {
if (columnSpec.getType().isCompatible(BitVectorValue.class) || columnSpec.getType().isCompatible(ByteVectorValue.class)) {
int length = trainingData.getVectorLengths().getOrDefault(i, 0).intValue();
for (int j = 0; j < length; ++j) {
covariateList.add(columnSpec.getName() + "[" + j + "]");
}
} else {
covariateList.add(columnSpec.getName());
}
}
}
final Map<? extends Integer, Integer> vectorIndexLengths = trainingData.getVectorLengths();
final Map<String, Integer> vectorLengths = new LinkedHashMap<String, Integer>();
for (DataColumnSpec spec : m_specialColumns) {
int colIndex = data.getSpec().findColumnIndex(spec.getName());
if (colIndex >= 0) {
vectorLengths.put(spec.getName(), vectorIndexLengths.get(colIndex));
}
}
// create content
LogisticRegressionContent content = new LogisticRegressionContent(m_outSpec, factorList, covariateList, vectorLengths, m_targetReferenceCategory, m_sortTargetCategories, m_sortFactorsCategories, beta, loglike, covMat, iter);
return content;
}
Aggregations