use of org.knime.core.data.container.AbstractCellFactory in project knime-core by knime.
the class PMMLRuleSetPredictorNodeModel method createRearranger.
/**
 * Constructs the {@link ColumnRearranger} for computing the new columns.
 *
 * The rearranger appends the prediction column (and, if requested, a confidence column in front of it) computed
 * from the single RuleSetModel of {@code obj}. Rules are evaluated per row according to the first rule selection
 * method of the model (first hit, weighted max or weighted sum).
 *
 * @param obj The {@link PMMLPortObject} of the preprocessing model.
 * @param spec The {@link DataTableSpec} of the table.
 * @param replaceColumn Should replace the {@code outputColumnName}?
 * @param outputColumnName The output column name (which might be an existing).
 * @param addConfidence Should add the confidence values to a column?
 * @param confidenceColumnName The name of the confidence column.
 * @param validationColumnIdx Index of the validation column, {@code -1} if not specified.
 * @param processConcurrently Should be {@code false} when the statistics are to be computed.
 * @return The {@link ColumnRearranger} computing the result.
 * @throws InvalidSettingsException Problem with rules (e.g. not exactly one RuleSetModel in {@code obj}).
 * @throws UnsupportedOperationException If the model is not scorable or the target type is not supported.
 */
private static ColumnRearranger createRearranger(final PMMLPortObject obj, final DataTableSpec spec, final boolean replaceColumn, final String outputColumnName, final boolean addConfidence, final String confidenceColumnName, final int validationColumnIdx, final boolean processConcurrently) throws InvalidSettingsException {
    List<Node> models = obj.getPMMLValue().getModels(PMMLModelType.RuleSetModel);
    if (models.size() != 1) {
        throw new InvalidSettingsException("Expected exactly one RuleSetModel, but got: " + models.size());
    }
    final PMMLRuleTranslator translator = new PMMLRuleTranslator();
    obj.initializeModelTranslator(translator);
    if (!translator.isScorable()) {
        throw new UnsupportedOperationException("The model is not scorable.");
    }
    final List<PMMLRuleTranslator.Rule> rules = translator.getRules();
    ColumnRearranger ret = new ColumnRearranger(spec);
    final List<DataColumnSpec> targetCols = obj.getSpec().getTargetCols();
    // without a target column the prediction is produced as a String
    final DataType dataType = targetCols.isEmpty() ? StringCell.TYPE : targetCols.get(0).getType();
    DataColumnSpecCreator specCreator = new DataColumnSpecCreator(outputColumnName, dataType);
    // collect the possible outcomes of all rules for the domain of the output column
    Set<DataCell> outcomes = new LinkedHashSet<>();
    for (Rule rule : rules) {
        DataCell outcome;
        if (dataType.equals(BooleanCell.TYPE)) {
            outcome = BooleanCellFactory.create(rule.getOutcome());
        } else if (dataType.equals(StringCell.TYPE)) {
            outcome = new StringCell(rule.getOutcome());
        } else if (dataType.equals(DoubleCell.TYPE)) {
            try {
                outcome = new DoubleCell(Double.parseDouble(rule.getOutcome()));
            } catch (NumberFormatException e) {
                // an unparsable outcome is simply not added to the domain
                continue;
            }
        } else if (dataType.equals(IntCell.TYPE)) {
            try {
                outcome = new IntCell(Integer.parseInt(rule.getOutcome()));
            } catch (NumberFormatException e) {
                // an unparsable outcome is simply not added to the domain
                continue;
            }
        } else if (dataType.equals(LongCell.TYPE)) {
            try {
                outcome = new LongCell(Long.parseLong(rule.getOutcome()));
            } catch (NumberFormatException e) {
                // an unparsable outcome is simply not added to the domain
                continue;
            }
        } else {
            throw new UnsupportedOperationException("Unknown outcome type: " + dataType);
        }
        outcomes.add(outcome);
    }
    specCreator.setDomain(new DataColumnDomainCreator(outcomes).createDomain());
    DataColumnSpec colSpec = specCreator.createSpec();
    final RuleSelectionMethod ruleSelectionMethod = translator.getSelectionMethodList().get(0);
    final String defaultScore = translator.getDefaultScore();
    final Double defaultConfidence = translator.getDefaultConfidence();
    final DataColumnSpec[] specs;
    if (addConfidence) {
        // the confidence column is appended in front of the prediction column (see toCells)
        specs = new DataColumnSpec[] { new DataColumnSpecCreator(DataTableSpec.getUniqueColumnName(ret.createSpec(), confidenceColumnName), DoubleCell.TYPE).createSpec(), colSpec };
    } else {
        specs = new DataColumnSpec[] { colSpec };
    }
    final int oldColumnIndex = replaceColumn ? ret.indexOf(outputColumnName) : -1;
    ret.append(new AbstractCellFactory(processConcurrently, specs) {
        /** Possible values of the target column from the data dictionary; {@code null} if not available. */
        private final List<String> m_values;
        {
            Map<String, List<String>> dd = translator.getDataDictionary();
            // the target column list may be empty (the output type then falls back to String above),
            // so it must not be indexed unconditionally
            m_values = targetCols.isEmpty() ? null : dd.get(targetCols.get(0).getName());
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public DataCell[] getCells(final DataRow row) {
            // See http://www.dmg.org/v4-1/RuleSet.html#Rule
            switch (ruleSelectionMethod.getCriterion().intValue()) {
                case RuleSelectionMethod.Criterion.INT_FIRST_HIT: {
                    Pair<DataCell, Double> resultAndConfidence = selectFirstHit(row);
                    return toCells(resultAndConfidence);
                }
                case RuleSelectionMethod.Criterion.INT_WEIGHTED_MAX: {
                    Pair<DataCell, Double> resultAndConfidence = selectWeightedMax(row);
                    return toCells(resultAndConfidence);
                }
                case RuleSelectionMethod.Criterion.INT_WEIGHTED_SUM: {
                    Pair<DataCell, Double> resultAndConfidence = selectWeightedSum(row);
                    return toCells(resultAndConfidence);
                }
                default:
                    throw new UnsupportedOperationException(ruleSelectionMethod.getCriterion().toString());
            }
        }

        /**
         * Converts the pair to a {@link DataCell} array.
         *
         * @param resultAndConfidence The {@link Pair}.
         * @return The result alone, or the confidence (missing if {@code null}) followed by the result.
         */
        private DataCell[] toCells(final Pair<DataCell, Double> resultAndConfidence) {
            if (!addConfidence) {
                return new DataCell[] { resultAndConfidence.getFirst() };
            }
            if (resultAndConfidence.getSecond() == null) {
                return new DataCell[] { DataType.getMissingCell(), resultAndConfidence.getFirst() };
            }
            return new DataCell[] { new DoubleCell(resultAndConfidence.getSecond()), resultAndConfidence.getFirst() };
        }

        /**
         * Computes the result and the confidence using the weighted sum method.
         *
         * @param row A {@link DataRow}
         * @return The result and the confidence.
         * @throws IllegalStateException If a matching rule's score is not among the data dictionary values.
         */
        private Pair<DataCell, Double> selectWeightedSum(final DataRow row) {
            final Map<String, Double> scoreToSumWeight = new LinkedHashMap<String, Double>();
            // the dictionary may have no entry for the target column; then no score is known
            if (m_values != null) {
                for (String val : m_values) {
                    scoreToSumWeight.put(val, 0.0);
                }
            }
            int matchedRuleCount = 0;
            for (final PMMLRuleTranslator.Rule rule : rules) {
                if (Boolean.TRUE.equals(rule.getCondition().evaluate(row, spec))) {
                    ++matchedRuleCount;
                    Double sumWeight = scoreToSumWeight.get(rule.getOutcome());
                    if (sumWeight == null) {
                        throw new IllegalStateException("The score value: " + rule.getOutcome() + " is not in the data dictionary.");
                    }
                    final Double wRaw = rule.getWeight();
                    // a rule without weight contributes nothing to the sum
                    final double w = wRaw == null ? 0.0 : wRaw.doubleValue();
                    scoreToSumWeight.put(rule.getOutcome(), sumWeight + w);
                }
            }
            double maxSumWeight = Double.NEGATIVE_INFINITY;
            String bestScore = null;
            for (Entry<String, Double> entry : scoreToSumWeight.entrySet()) {
                final double d = entry.getValue().doubleValue();
                if (d > maxSumWeight) {
                    maxSumWeight = d;
                    bestScore = entry.getKey();
                }
            }
            if (bestScore == null || matchedRuleCount == 0) {
                return pair(result(defaultScore), defaultConfidence);
            }
            return pair(result(bestScore), maxSumWeight / matchedRuleCount);
        }

        /**
         * Helper method to create {@link Pair}s.
         *
         * @param f The first element.
         * @param s The second element.
         * @return The new pair.
         */
        private <F, S> Pair<F, S> pair(final F f, final S s) {
            return new Pair<F, S>(f, s);
        }

        /**
         * Computes the result and the confidence using the weighted max method.
         *
         * @param row A {@link DataRow}
         * @return The result and the confidence.
         */
        private Pair<DataCell, Double> selectWeightedMax(final DataRow row) {
            double maxWeight = Double.NEGATIVE_INFINITY;
            PMMLRuleTranslator.Rule bestRule = null;
            for (final PMMLRuleTranslator.Rule rule : rules) {
                if (Boolean.TRUE.equals(rule.getCondition().evaluate(row, spec))) {
                    // the weight may be absent; use the same 0.0 fallback as the weighted sum
                    // instead of failing on unboxing
                    final Double wRaw = rule.getWeight();
                    final double w = wRaw == null ? 0.0 : wRaw.doubleValue();
                    if (w > maxWeight) {
                        maxWeight = w;
                        bestRule = rule;
                    }
                }
            }
            if (bestRule == null) {
                return pair(result(defaultScore), defaultConfidence);
            }
            bestRule.setRecordCount(bestRule.getRecordCount() + 1);
            DataCell result = result(bestRule);
            if (validationColumnIdx >= 0) {
                if (row.getCell(validationColumnIdx).equals(result)) {
                    bestRule.setNbCorrect(bestRule.getNbCorrect() + 1);
                }
            }
            Double confidence = bestRule.getConfidence();
            return pair(result, confidence == null ? defaultConfidence : confidence);
        }

        /**
         * Selects the outcome of the rule and converts it to the proper outcome type.
         *
         * @param rule A {@link Rule}.
         * @return The {@link DataCell} representing the result. (May be missing.)
         */
        private DataCell result(final PMMLRuleTranslator.Rule rule) {
            String outcome = rule.getOutcome();
            return result(outcome);
        }

        /**
         * Constructs the {@link DataCell} from its {@link String} representation ({@code outcome}) using the
         * type of the output column.
         *
         * @param outcome The {@link String} representation, may be {@code null}.
         * @return The {@link DataCell}; a missing cell if {@code outcome} is {@code null} or cannot be parsed
         *         as a number of the expected type.
         */
        private DataCell result(final String outcome) {
            if (outcome == null) {
                return DataType.getMissingCell();
            }
            try {
                if (dataType.isCompatible(BooleanValue.class)) {
                    return BooleanCellFactory.create(outcome);
                }
                if (IntCell.TYPE.isASuperTypeOf(dataType)) {
                    return new IntCell(Integer.parseInt(outcome));
                }
                if (LongCell.TYPE.isASuperTypeOf(dataType)) {
                    return new LongCell(Long.parseLong(outcome));
                }
                if (DoubleCell.TYPE.isASuperTypeOf(dataType)) {
                    return new DoubleCell(Double.parseDouble(outcome));
                }
                return new StringCell(outcome);
            } catch (NumberFormatException e) {
                return new MissingCell(outcome + "\n" + e.getMessage());
            }
        }

        /**
         * Selects the first rule that matches and computes the confidence and result for the {@code row}.
         *
         * @param row A {@link DataRow}.
         * @return The result and the confidence.
         */
        private Pair<DataCell, Double> selectFirstHit(final DataRow row) {
            for (final PMMLRuleTranslator.Rule rule : rules) {
                Boolean eval = rule.getCondition().evaluate(row, spec);
                if (Boolean.TRUE.equals(eval)) {
                    rule.setRecordCount(rule.getRecordCount() + 1);
                    DataCell result = result(rule);
                    if (validationColumnIdx >= 0) {
                        if (row.getCell(validationColumnIdx).equals(result)) {
                            rule.setNbCorrect(rule.getNbCorrect() + 1);
                        }
                    }
                    Double confidence = rule.getConfidence();
                    return pair(result, confidence == null ? defaultConfidence : confidence);
                }
            }
            return pair(result(defaultScore), defaultConfidence);
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public void afterProcessing() {
            super.afterProcessing();
            // NOTE(review): the result of this call is discarded; kept unchanged in case it has side effects
            obj.getPMMLValue();
            RuleSetModel ruleSet = translator.getOriginalRuleSetModel();
            assert rules.size() == ruleSet.getRuleSet().getSimpleRuleList().size() + ruleSet.getRuleSet().getCompoundRuleList().size();
            // the statistics are only written back when all rules are simple rules
            if (ruleSet.getRuleSet().getSimpleRuleList().size() == rules.size()) {
                for (int i = 0; i < rules.size(); ++i) {
                    Rule rule = rules.get(i);
                    final SimpleRule simpleRuleArray = ruleSet.getRuleSet().getSimpleRuleArray(i);
                    synchronized (simpleRuleArray) /*synchronized fixes AP-6766 */
                    {
                        simpleRuleArray.setRecordCount(rule.getRecordCount());
                        if (validationColumnIdx >= 0) {
                            simpleRuleArray.setNbCorrect(rule.getNbCorrect());
                        } else if (simpleRuleArray.isSetNbCorrect()) {
                            simpleRuleArray.unsetNbCorrect();
                        }
                    }
                }
            }
        }
    });
    if (replaceColumn) {
        ret.remove(outputColumnName);
        // NOTE(review): with addConfidence the appended order is {confidence, output}, so this index
        // appears to address the confidence column rather than the output column - confirm the intent
        ret.move(ret.getColumnCount() - 1 - (addConfidence ? 1 : 0), oldColumnIndex);
    }
    return ret;
}
use of org.knime.core.data.container.AbstractCellFactory in project knime-core by knime.
the class NumericOutliersReviser method replaceOutliers.
/**
 * Streams the row input to the row output, substituting the cells of the outlier columns with their treated
 * values. While doing so the member, outlier replacement and missing group counters are updated and, if enabled,
 * the domains of the outlier columns are tracked.
 *
 * @param exec the execution context
 * @param in the row input whose outliers have to be treated
 * @param out the row output whose outliers have been treated
 * @param outlierModel the model storing the permitted intervals
 * @param memberCounter the member counter
 * @param outlierRepCounter the outlier replacement counter
 * @param missingGroupsCounter the missing groups counter
 * @throws Exception any exception to indicate an error, cancelation
 */
private void replaceOutliers(final ExecutionContext exec, final RowInput in, final RowOutput out, final NumericOutliersModel outlierModel, final MemberCounter memberCounter, final MemberCounter outlierRepCounter, final MemberCounter missingGroupsCounter) throws Exception {
    // number of columns that are checked for outliers
    final int colCount = m_outlierColNames.length;
    final DataTableSpec inSpec = in.getDataTableSpec();
    // the re-arranger overwrites the outlier columns with their treated versions
    final ColumnRearranger rearranger = new ColumnRearranger(inSpec);
    // positions of the outlier columns inside the input table
    final int[] colPositions = calculateOutlierIndicies(inSpec);
    // the replaced columns keep their original specs
    final DataColumnSpec[] replacedSpecs = new DataColumnSpec[colCount];
    int specIdx = 0;
    for (final int position : colPositions) {
        if (specIdx >= colCount) {
            break;
        }
        replacedSpecs[specIdx++] = inSpec.getColumnSpec(position);
    }
    // a single buffer is reused for every row; per the original author's note the re-arranger copies the
    // returned values (NOTE(review): the factory is flagged for concurrent processing - confirm the shared
    // buffer is safe wherever this factory is used)
    final DataCell[] rowBuffer = new DataCell[colCount];
    final AbstractCellFactory factory = new AbstractCellFactory(true, replacedSpecs) {
        @Override
        public DataCell[] getCells(final DataRow row) {
            final GroupKey groupKey = outlierModel.getKey(row, inSpec);
            // null if the model holds no intervals for this group
            final Map<String, double[]> intervals = outlierModel.getGroupIntervals(groupKey);
            for (int col = 0; col < colCount; col++) {
                final DataCell original = row.getCell(colPositions[col]);
                final String colName = m_outlierColNames[col];
                // missing cells and cells of unknown groups are passed through untouched
                DataCell replacement = original;
                if (!original.isMissing()) {
                    if (intervals == null) {
                        // unknown group
                        missingGroupsCounter.incrementMemberCount(colName, groupKey);
                    } else {
                        memberCounter.incrementMemberCount(colName, groupKey);
                        replacement = treatCellValue(intervals.get(colName), original);
                    }
                }
                // a changed value means the cell was an outlier
                if (!replacement.equals(original)) {
                    outlierRepCounter.incrementMemberCount(colName, groupKey);
                }
                if (m_updateDomain && !replacement.isMissing()) {
                    m_domainUpdater.updateDomain(colName, ((DoubleValue) replacement).getDoubleValue());
                }
                rowBuffer[col] = replacement;
            }
            return rowBuffer;
        }
    };
    rearranger.replace(factory, colPositions);
    // run the replacement in streaming fashion from the input to the output
    final PortInput[] inputs = new PortInput[] { in };
    final PortOutput[] outputs = new PortOutput[] { out };
    rearranger.createStreamableFunction().runFinal(inputs, outputs, exec);
    exec.setProgress(1);
}
use of org.knime.core.data.container.AbstractCellFactory in project knime-core by knime.
the class DateTimeDifferenceNodeModel method createColumnRearranger.
/**
 * Creates the {@link ColumnRearranger} that appends the difference column to the input table.
 *
 * Depending on the selected modus the reference is the execution time, the configured fixed date&amp;time, or
 * (otherwise) no fixed value at all, in which case {@code null} is handed to the cell factory. A
 * {@code DateDifferenceCellFactory} is used when the first selected column is compatible with
 * {@link LocalDateValue}, a {@code TimeDifferenceCellFactory} otherwise.
 *
 * @param spec the spec of the input table
 * @return the rearranger with the appended cell factory
 * @throws InvalidSettingsException if the first selected column is not contained in {@code spec}
 */
private ColumnRearranger createColumnRearranger(final DataTableSpec spec) throws InvalidSettingsException {
    final ColumnRearranger rearranger = new ColumnRearranger(spec);
    final ZonedDateTime fixedDateTime;
    if (m_modusSelectModel.getStringValue().equals(ModusOptions.UseExecutionTime.name())) {
        fixedDateTime = ZonedDateTime.now();
    } else if (m_modusSelectModel.getStringValue().equals(ModusOptions.UseFixedTime.name())) {
        fixedDateTime = m_fixedDateTimeModel.getZonedDateTime();
    } else {
        fixedDateTime = null;
    }
    final String firstColumnName = m_col1stSelectModel.getStringValue();
    final int colIdx1 = spec.findColumnIndex(firstColumnName);
    // the type of the first column decides which factory is used, so it has to exist; report a missing
    // column as a settings problem instead of failing with an index error on the spec lookup below
    if (colIdx1 < 0) {
        throw new InvalidSettingsException("Column '" + firstColumnName + "' not found in the input table.");
    }
    // may be negative when the second column is not part of the table; passed on to the factory unchanged
    final int colIdx2 = spec.findColumnIndex(m_col2ndSelectModel.getStringValue());
    final AbstractCellFactory cellFac;
    final DataType type = spec.getColumnSpec(colIdx1).getType();
    if (type.isCompatible(LocalDateValue.class)) {
        cellFac = new DateDifferenceCellFactory(colIdx1, colIdx2, fixedDateTime == null ? null : fixedDateTime.toLocalDate(), createColumnSpec(spec));
    } else {
        cellFac = new TimeDifferenceCellFactory(colIdx1, colIdx2, fixedDateTime, createColumnSpec(spec));
    }
    rearranger.append(cellFac);
    return rearranger;
}
use of org.knime.core.data.container.AbstractCellFactory in project knime-core by knime.
the class AppendVariableToTableNodeModel method createColumnRearranger.
/**
 * Creates the {@link ColumnRearranger} that appends one column per selected flow variable; every row receives
 * the same (current) value of the variable.
 *
 * Column names are derived from the variable names: a name that already exists in the table (or was already
 * taken by a previously handled variable) gets the suffix {@code " (variable)"} unless it already ends with
 * {@code "(variable)"} (ignoring case), and is then made unique by appending {@code " (#n)"}.
 *
 * @param spec the spec of the input table
 * @return the rearranger with the appended constant columns
 * @throws InvalidSettingsException if no variable is selected, a selected variable does not exist with the
 *             expected type, or its type is not supported
 */
private ColumnRearranger createColumnRearranger(final DataTableSpec spec) throws InvalidSettingsException {
    ColumnRearranger arranger = new ColumnRearranger(spec);
    // all column names that are already in use (input columns and columns added so far)
    Set<String> nameHash = new HashSet<String>();
    for (DataColumnSpec c : spec) {
        nameHash.add(c.getName());
    }
    List<Pair<String, FlowVariable.Type>> vars;
    if (m_settings.getIncludeAll()) {
        vars = getAllVariables();
    } else {
        vars = m_settings.getVariablesOfInterest();
    }
    if (vars.isEmpty()) {
        throw new InvalidSettingsException("No variables selected");
    }
    DataColumnSpec[] specs = new DataColumnSpec[vars.size()];
    final DataCell[] values = new DataCell[vars.size()];
    for (int i = 0; i < vars.size(); i++) {
        Pair<String, FlowVariable.Type> c = vars.get(i);
        String name = c.getFirst();
        DataType type;
        // read the current value of the variable; the original exception is kept as cause so that the
        // origin of a failed lookup is not lost
        switch (c.getSecond()) {
            case DOUBLE:
                type = DoubleCell.TYPE;
                try {
                    double dValue = peekFlowVariableDouble(name);
                    values[i] = new DoubleCell(dValue);
                } catch (NoSuchElementException e) {
                    throw new InvalidSettingsException("No such flow variable (of type double): " + name, e);
                }
                break;
            case INTEGER:
                type = IntCell.TYPE;
                try {
                    int iValue = peekFlowVariableInt(name);
                    values[i] = new IntCell(iValue);
                } catch (NoSuchElementException e) {
                    throw new InvalidSettingsException("No such flow variable (of type int): " + name, e);
                }
                break;
            case STRING:
                type = StringCell.TYPE;
                try {
                    String sValue = peekFlowVariableString(name);
                    // a null string value is written as empty string
                    sValue = sValue == null ? "" : sValue;
                    values[i] = new StringCell(sValue);
                } catch (NoSuchElementException e) {
                    throw new InvalidSettingsException("No such flow variable (of type String): " + name, e);
                }
                break;
            default:
                throw new InvalidSettingsException("Unsupported variable type: " + c.getSecond());
        }
        if (nameHash.contains(name) && !name.toLowerCase().endsWith("(variable)")) {
            name = name.concat(" (variable)");
        }
        String newName = name;
        int uniquifier = 1;
        // Set#add returns false while the candidate name is already taken
        while (!nameHash.add(newName)) {
            newName = name + " (#" + (uniquifier++) + ")";
        }
        specs[i] = new DataColumnSpecCreator(newName, type).createSpec();
    }
    arranger.append(new AbstractCellFactory(specs) {
        /**
         * {@inheritDoc}
         */
        @Override
        public DataCell[] getCells(final DataRow row) {
            // the same array of constant cells is returned for every row
            return values;
        }
    });
    return arranger;
}
use of org.knime.core.data.container.AbstractCellFactory in project knime-core by knime.
the class KnnNodeModel method createRearranger.
/*
 * Creates the rearranger appending the predicted class column and, if class probabilities are requested in the
 * settings, one double column per possible value of the class column (ordered by their string representation).
 * New column names get "_dup" appended until they do not occur in the input spec.
 *
 * @param maxRows - can be -1 if can't be determined (streaming)
 */
private ColumnRearranger createRearranger(final DataTableSpec in, final DataColumnSpec classColumnSpec, final List<Integer> featureColumns, final Map<Integer, Integer> firstToSecond, final KDTree<DataCell> tree, final double maxRows) {
    ColumnRearranger rearranger = new ColumnRearranger(in);
    List<DataColumnSpec> appendedSpecs = new ArrayList<DataColumnSpec>();

    // the prediction column is a renamed copy of the class column spec
    String columnName = "Class [kNN]";
    while (in.containsName(columnName)) {
        columnName += "_dup";
    }
    DataColumnSpecCreator specCreator = new DataColumnSpecCreator(classColumnSpec);
    specCreator.setName(columnName);
    appendedSpecs.add(specCreator.createSpec());

    final DataCell[] possibleValues;
    if (!m_settings.outputClassProbabilities()) {
        possibleValues = new DataCell[0];
    } else {
        possibleValues = classColumnSpec.getDomain().getValues().toArray(new DataCell[0]);
        // order the class values by their string representation
        Arrays.sort(possibleValues, new Comparator<DataCell>() {
            @Override
            public int compare(final DataCell left, final DataCell right) {
                final String leftText = left.toString();
                final String rightText = right.toString();
                return leftText.compareTo(rightText);
            }
        });
        // one probability column per class value
        for (int i = 0; i < possibleValues.length; i++) {
            columnName = possibleValues[i].toString();
            while (in.containsName(columnName)) {
                columnName += "_dup";
            }
            specCreator = new DataColumnSpecCreator(columnName, DoubleCell.TYPE);
            appendedSpecs.add(specCreator.createSpec());
        }
    }

    final DataColumnSpec[] specArray = appendedSpecs.toArray(new DataColumnSpec[appendedSpecs.size()]);
    rearranger.append(new AbstractCellFactory(specArray) {
        /**
         * {@inheritDoc}
         */
        @Override
        public void setProgress(final long curRowNr, final long rowCount, final RowKey lastKey, final ExecutionMonitor exec) {
            final String message = "Classifying row " + lastKey;
            if (!(maxRows > 0)) {
                // total row count unknown: only report the message
                exec.setProgress(message);
                return;
            }
            exec.setProgress(curRowNr / maxRows, message);
        }

        @Override
        public DataCell[] getCells(final DataRow row) {
            final List<DataCell> classified = classify(row, tree, featureColumns, firstToSecond, possibleValues);
            return classified.toArray(new DataCell[classified.size()]);
        }
    });
    return rearranger;
}
Aggregations