use of org.dmg.pmml.DerivedFieldDocument.DerivedField in project knime-core by knime.
the class PMMLPortObject method addGlobalTransformations.
/**
 * Merges the given global transformations into the PMML document. Only
 * DerivedField elements are supported so far. When the document has no
 * transformation dictionary yet, the passed dictionary is installed as the
 * new one; otherwise its derived fields are appended to the existing
 * dictionary.
 *
 * @param dictionary the transformation dictionary that contains the
 *            transformations to be added
 * @throws IllegalArgumentException if the dictionary contains
 *             DefineFunction elements, or if a derived field name occurs
 *             more than once in the resulting dictionary
 */
public void addGlobalTransformations(final TransformationDictionary dictionary) {
    if (dictionary.getDefineFunctionArray().length > 0) {
        throw new IllegalArgumentException(
            "DefineFunctions are not supported so far. Only derived fields are allowed.");
    }

    final PMML pmml = m_pmmlDoc.getPMML();
    TransformationDictionary target = pmml.getTransformationDictionary();
    if (target != null) {
        // a dictionary already exists: extend it with the new derived fields
        final DerivedField[] merged =
            appendDerivedFields(target.getDerivedFieldArray(), dictionary.getDerivedFieldArray());
        target.setDerivedFieldArray(merged);
    } else {
        // no dictionary yet: install the given one and re-read it from the document
        pmml.setTransformationDictionary(dictionary);
        target = pmml.getTransformationDictionary();
    }

    // Collect the derived names (which must be unique) and the column names
    // under which the derived fields are registered in the spec.
    final DerivedField[] derivedFields = target.getDerivedFieldArray();
    final List<String> preprocColNames = new ArrayList<String>(derivedFields.length);
    final Set<String> derivedNames = new HashSet<String>();
    for (DerivedField field : derivedFields) {
        final String derivedName = field.getName();
        if (!derivedNames.add(derivedName)) {
            throw new IllegalArgumentException("Derived field name \"" + derivedName + "\" is not unique.");
        }
        final String displayName = field.getDisplayName();
        if (displayName != null) {
            preprocColNames.add(displayName);
        } else {
            preprocColNames.add(derivedName);
        }
    }

    /* Drop every data field from the data dictionary that was created as a
     * derived field. KNIME does not distinguish the origin of columns and
     * adds all of them to the data dictionary, which leads to duplicate
     * entries in PMML. Such columns should appear only once, as derived
     * field in the transformation dictionary or local transformations. */
    final DataDictionary dataDict = pmml.getDataDictionary();
    final DataField[] allDataFields = dataDict.getDataFieldArray();
    final List<DataField> retained = new ArrayList<DataField>(allDataFields.length);
    for (DataField candidate : allDataFields) {
        if (!derivedNames.contains(candidate.getName())) {
            retained.add(candidate);
        }
    }
    dataDict.setDataFieldArray(retained.toArray(new DataField[0]));
    // keep the advertised field count in sync with the filtered array
    dataDict.setNumberOfFields(BigInteger.valueOf(retained.size()));

    /* Rewrite the field names referenced by an already contained model.
     * Only the first model type that is present is handled; the attribute
     * that carries the field name differs per model type. */
    final Map<String, String> nameMapping = new DerivedFieldMapper(derivedFields).getDerivedFieldMap();
    if (pmml.getTreeModelArray().length > 0) {
        fixAttributeAtPath(pmml, TREE_PATH, FIELD, nameMapping);
    } else if (pmml.getClusteringModelArray().length > 0) {
        fixAttributeAtPath(pmml, CLUSTERING_PATH, FIELD, nameMapping);
    } else if (pmml.getNeuralNetworkArray().length > 0) {
        fixAttributeAtPath(pmml, NN_PATH, FIELD, nameMapping);
    } else if (pmml.getSupportVectorMachineModelArray().length > 0) {
        fixAttributeAtPath(pmml, SVM_PATH, FIELD, nameMapping);
    } else if (pmml.getRegressionModelArray().length > 0) {
        fixAttributeAtPath(pmml, REGRESSION_PATH_1, FIELD, nameMapping);
        fixAttributeAtPath(pmml, REGRESSION_PATH_2, NAME, nameMapping);
    } else if (pmml.getGeneralRegressionModelArray().length > 0) {
        fixAttributeAtPath(pmml, GR_PATH_1, NAME, nameMapping);
        fixAttributeAtPath(pmml, GR_PATH_2, LABEL, nameMapping);
        fixAttributeAtPath(pmml, GR_PATH_3, PREDICTOR_NAME, nameMapping);
    }
    // otherwise nothing to rewrite as no model exists yet

    // rebuild the spec so that it lists the preprocessed columns
    final PMMLPortObjectSpecCreator specCreator =
        new PMMLPortObjectSpecCreator(this, m_spec.getDataTableSpec());
    specCreator.addPreprocColNames(preprocColNames);
    m_spec = specCreator.createSpec();
}
use of org.dmg.pmml.DerivedFieldDocument.DerivedField in project knime-core by knime.
the class PMMLBinningTranslator method createDerivedFields.
/**
 * Builds one PMML derived field per entry of {@code m_columnToBins}. Each
 * derived field carries a Discretize element with one DiscretizeBin per
 * bin of the column.
 *
 * @return the derived fields, in the iteration order of
 *         {@code m_columnToBins}
 */
private DerivedField[] createDerivedFields() {
    final DerivedField[] result = new DerivedField[m_columnToBins.size()];
    int next = 0;
    for (Map.Entry<String, Bin[]> binsOfColumn : m_columnToBins.entrySet()) {
        final String column = binsOfColumn.getKey();
        final DerivedField derived = DerivedField.Factory.newInstance();

        /* Look up the referenced field name BEFORE a new derived name is
         * created for this column: the mapper only holds the current
         * mapping. */
        final String sourceField = m_mapper.getDerivedFieldName(column);
        final Discretize discretize = derived.addNewDiscretize();
        discretize.setField(sourceField);

        final String appendedName = m_columnToAppend.get(column);
        if (appendedName == null) {
            // no explicit target name: derive one and remember the original as display name
            derived.setName(m_mapper.createDerivedFieldName(column));
            derived.setDisplayName(column);
        } else {
            derived.setName(appendedName);
        }
        derived.setOptype(OPTYPE.CATEGORICAL);
        derived.setDataType(DATATYPE.STRING);

        for (Bin rawBin : binsOfColumn.getValue()) {
            final NumericBin numericBin = (NumericBin) rawBin;
            final boolean openOnLeft = numericBin.isLeftOpen();
            final boolean openOnRight = numericBin.isRightOpen();
            final double lower = numericBin.getLeftValue();
            final double upper = numericBin.getRightValue();

            final DiscretizeBin pmmlBin = discretize.addNewDiscretizeBin();
            pmmlBin.setBinValue(numericBin.getBinName());
            final Interval interval = pmmlBin.addNewInterval();
            // infinite bounds are expressed by leaving the margin unset
            if (!Double.isInfinite(lower)) {
                interval.setLeftMargin(lower);
            }
            if (!Double.isInfinite(upper)) {
                interval.setRightMargin(upper);
            }
            if (openOnLeft) {
                interval.setClosure(openOnRight ? Closure.OPEN_OPEN : Closure.OPEN_CLOSED);
            } else {
                interval.setClosure(openOnRight ? Closure.CLOSED_OPEN : Closure.CLOSED_CLOSED);
            }
        }
        result[next] = derived;
        next++;
    }
    return result;
}
use of org.dmg.pmml.DerivedFieldDocument.DerivedField in project knime-core by knime.
the class PMMLStringConversionTranslator method createDerivedFields.
/**
 * Builds one PMML derived field per column in {@code m_includeCols}. Each
 * derived field is a plain field reference whose data type and optype are
 * derived from {@code m_parseType}.
 *
 * @return the derived fields, in the order of {@code m_includeCols}
 */
private DerivedField[] createDerivedFields() {
    final DATATYPE.Enum pmmlType = PMMLDataDictionaryTranslator.getPMMLDataType(m_parseType);
    final OPTYPE.Enum pmmlOptype = PMMLDataDictionaryTranslator.getOptype(m_parseType);
    final DerivedField[] result = new DerivedField[m_includeCols.size()];
    for (int pos = 0; pos < result.length; pos++) {
        final DerivedField derived = DerivedField.Factory.newInstance();
        final String column = m_includeCols.get(pos);
        derived.setDisplayName(column);

        /* Look up the referenced field name BEFORE a new derived name is
         * created for this column: the mapper only holds the current
         * mapping. */
        final String sourceField = m_mapper.getDerivedFieldName(column);
        derived.setName(m_mapper.createDerivedFieldName(column));
        derived.setDataType(pmmlType);
        derived.setOptype(pmmlOptype);
        derived.addNewFieldRef().setField(sourceField);

        result[pos] = derived;
    }
    return result;
}
use of org.dmg.pmml.DerivedFieldDocument.DerivedField in project knime-core by knime.
the class PMMLStringConversionTranslator method initializeFrom.
/**
 * {@inheritDoc}
 * <p>
 * This implementation consumes every derived field that is a field
 * reference and whose data type maps to an int, double or string KNIME
 * type. For each consumed field the corresponding KNIME column name is
 * added to {@code m_includeCols}, and {@code m_parseType} is set to the
 * matching cell type, so the last consumed field determines the parse
 * type.
 *
 * @param derivedFields the derived fields to read, may be {@code null}
 * @return the indices (into {@code derivedFields}) of the consumed fields;
 *         an empty, immutable list if {@code derivedFields} is
 *         {@code null}
 */
@Override
public List<Integer> initializeFrom(final DerivedField[] derivedFields) {
    if (derivedFields == null) {
        // type-safe replacement for the raw Collections.EMPTY_LIST
        return Collections.<Integer>emptyList();
    }
    final List<Integer> consumed = new ArrayList<Integer>(derivedFields.length);
    for (int i = 0; i < derivedFields.length; i++) {
        final DerivedField df = derivedFields[i];
        if (!df.isSetFieldRef()) {
            // only reading field references
            continue;
        }
        final DataType dataType = PMMLDataDictionaryTranslator.getKNIMEDataType(df.getDataType());
        // The int check must come first; the branches are evaluated in order.
        if (dataType.isCompatible(IntValue.class)) {
            m_parseType = IntCell.TYPE;
        } else if (dataType.isCompatible(DoubleValue.class)) {
            m_parseType = DoubleCell.TYPE;
        } else if (dataType == StringCell.TYPE) {
            m_parseType = StringCell.TYPE;
        } else {
            // only processing int, double and string conversions
            continue;
        }
        /*
         * The display name contains the name of the column in KNIME that
         * corresponds to the derived field in PMML. This is necessary if
         * derived fields are defined on other derived fields and the
         * columns in KNIME are replaced with the preprocessed values.
         * In this case KNIME has to know the original names (e.g. A) while
         * PMML references to A*, A** etc.
         */
        final String displayName = df.getDisplayName();
        if (displayName != null) {
            m_includeCols.add(displayName);
        } else {
            // no display name: resolve the column through the referenced field
            m_includeCols.add(m_mapper.getColumnName(df.getFieldRef().getField()));
        }
        consumed.add(i);
    }
    return consumed;
}
use of org.dmg.pmml.DerivedFieldDocument.DerivedField in project knime-core by knime.
the class DataColumnSpecFilterPMMLNodeModel method createPMMLOut.
/**
 * Creates the outgoing PMML port object for the filtered table spec.
 * <p>
 * Without incoming PMML a port object is created from a fresh spec only.
 * Otherwise the incoming document is parsed into a new document, all
 * excluded data fields are removed from its data dictionary, and a warning
 * message is set on the node for every model mining field and every
 * transformation that references an excluded column.
 *
 * @param pmmlIn the incoming PMML port object, may be {@code null}
 * @param outSpec the table spec after filtering
 * @param res the filter result that decides which columns are excluded
 * @return the PMML port object for the output port
 * @throws XmlException if the incoming PMML document cannot be parsed
 * @throws IllegalArgumentException if the port object cannot be created
 *             from the filtered document; when no column is included the
 *             exception carries an explanatory message and the original
 *             exception as cause
 */
private PMMLPortObject createPMMLOut(final PMMLPortObject pmmlIn, final DataTableSpec outSpec, final FilterResult res) throws XmlException {
    if (pmmlIn == null) {
        return new PMMLPortObject(createPMMLSpec(null, outSpec, res));
    }

    // method-local buffer, so the unsynchronized StringBuilder is sufficient
    final StringBuilder warningBuffer = new StringBuilder();
    PMMLDocument pmmldoc;
    try (LockedSupplier<Document> supplier = pmmlIn.getPMMLValue().getDocumentSupplier()) {
        pmmldoc = PMMLDocument.Factory.parse(supplier.get());
    }

    // Inspect models to check if they use any excluded columns
    final List<PMMLModelWrapper> models = PMMLModelWrapper.getModelListFromPMMLDocument(pmmldoc);
    for (PMMLModelWrapper model : models) {
        final MiningSchema ms = model.getMiningSchema();
        for (MiningField mf : ms.getMiningFieldList()) {
            if (isExcluded(mf.getName(), res)) {
                if (warningBuffer.length() != 0) {
                    warningBuffer.append("\n");
                }
                warningBuffer.append(model.getModelType().name())
                    .append(" uses excluded column ")
                    .append(mf.getName());
            }
        }
    }

    final List<String> warningFields = new ArrayList<String>();
    final PMML pmml = pmmldoc.getPMML();
    // Now check the transformations if they exist
    if (pmml.getTransformationDictionary() != null) {
        for (DerivedField df : pmml.getTransformationDictionary().getDerivedFieldList()) {
            final FieldRef fr = df.getFieldRef();
            if (fr != null && isExcluded(fr.getField(), res)) {
                warningFields.add(fr.getField());
            }
            final Aggregate a = df.getAggregate();
            if (a != null && isExcluded(a.getField(), res)) {
                warningFields.add(a.getField());
            }
            final Apply ap = df.getApply();
            if (ap != null) {
                // only the first excluded field reference of an Apply is reported
                for (FieldRef fieldRef : ap.getFieldRefList()) {
                    if (isExcluded(fieldRef.getField(), res)) {
                        warningFields.add(fieldRef.getField());
                        break;
                    }
                }
            }
            final Discretize d = df.getDiscretize();
            if (d != null && isExcluded(d.getField(), res)) {
                warningFields.add(d.getField());
            }
            final MapValues mv = df.getMapValues();
            if (mv != null) {
                // unlike Apply, every excluded field of a MapValues is reported
                for (FieldColumnPair fcp : mv.getFieldColumnPairList()) {
                    if (isExcluded(fcp.getField(), res)) {
                        warningFields.add(fcp.getField());
                    }
                }
            }
            final NormContinuous nc = df.getNormContinuous();
            if (nc != null && isExcluded(nc.getField(), res)) {
                warningFields.add(nc.getField());
            }
            final NormDiscrete nd = df.getNormDiscrete();
            if (nd != null && isExcluded(nd.getField(), res)) {
                warningFields.add(nd.getField());
            }
        }
    }

    // Apply filter to spec
    final DataDictionary dict = pmml.getDataDictionary();
    final List<DataField> fields = dict.getDataFieldList();
    int numFields = 0;
    // walk backwards so that removing index i does not shift the indices still to be visited
    for (int i = fields.size() - 1; i >= 0; i--) {
        if (isExcluded(fields.get(i).getName(), res)) {
            dict.removeDataField(i);
        } else {
            numFields++;
        }
    }
    dict.setNumberOfFields(BigInteger.valueOf(numFields));
    pmml.setDataDictionary(dict);
    pmmldoc.setPMML(pmml);

    // generate warnings and set as warning message
    for (String s : warningFields) {
        if (warningBuffer.length() != 0) {
            warningBuffer.append("\n");
        }
        warningBuffer.append("Transformation dictionary uses excluded column ").append(s);
    }
    if (warningBuffer.length() > 0) {
        setWarningMessage(warningBuffer.toString().trim());
    }

    try {
        return new PMMLPortObject(createPMMLSpec(pmmlIn.getSpec(), outSpec, res), pmmldoc);
    } catch (IllegalArgumentException e) {
        if (res.getIncludes().length == 0) {
            // give the user a hint about the most likely cause, keeping the original as cause
            throw new IllegalArgumentException("Excluding all columns produces invalid PMML", e);
        }
        throw e;
    }
}
Aggregations