Search in sources :

Example 16 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaUtils method removeInstanceId.

/**
 * Returns a data set that no longer contains the attribute named
 * {@code Constants.ID_FEATURE_NAME}. If the attribute is absent, a plain copy of the input is
 * returned instead. The input data set itself is never modified.
 *
 * @param data
 *            data set with or without the instance-id attribute
 * @param multilabel
 *            whether multi-label processing is active; if so, the class index that
 *            {@code data} had on entry is re-applied to the returned data set
 * @return a new data set without the instance-id attribute
 * @throws Exception
 *             if configuring or applying the Weka filter fails
 */
public static Instances removeInstanceId(Instances data, boolean multilabel) throws Exception {
    // Remember the class index up front; it is re-applied below for multi-label data.
    int originalClassIndex = data.classIndex();

    Instances result;
    if (data.attribute(Constants.ID_FEATURE_NAME) == null) {
        // Nothing to strip: hand back a copy so callers always receive a fresh object.
        result = new Instances(data);
    } else {
        int zeroBasedIdIndex = data.attribute(Constants.ID_FEATURE_NAME).index();

        Remove idRemover = new Remove();
        // The index is shifted by one because the filter is given a 1-based position.
        idRemover.setAttributeIndices(String.valueOf(zeroBasedIdIndex + 1));
        idRemover.setInvertSelection(false);
        idRemover.setInputFormat(data);
        result = Filter.useFilter(data, idRemover);
    }

    // make sure the class index gets retained in multi-label
    if (multilabel) {
        result.setClassIndex(originalClassIndex);
    }
    return result;
}
Also used : Instances(weka.core.Instances) MultiLabelInstances(mulan.data.MultiLabelInstances) Remove(weka.filters.unsupervised.attribute.Remove)

Example 17 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaUtils method addInstanceId.

/**
 * Copies the instance-id attribute ({@code Constants.ID_FEATURE_NAME}) and its values from
 * {@code oldData} into a copy of {@code newData}, iff {@code oldData} has such an attribute.
 * The new string attribute is inserted as the last attribute for multi-label data and as the
 * first attribute otherwise. Values are transferred by row position, so row {@code i} of the
 * result receives the id of row {@code i} of {@code oldData}.
 *
 * @param newData
 *            data set without instanceId attribute
 * @param oldData
 *            data set with or without instanceId attribute
 * @param isMultilabel
 *            is multi label processing
 * @return a copy of {@code newData}, extended by the instance-id attribute if {@code oldData}
 *         carried one
 * @throws Exception
 *             if configuring or applying the Weka filter fails
 */
public static Instances addInstanceId(Instances newData, Instances oldData, boolean isMultilabel) throws Exception {
    Attribute sourceIdAttribute = oldData.attribute(Constants.ID_FEATURE_NAME);
    if (sourceIdAttribute == null) {
        // No id to carry over: return an unchanged copy.
        return new Instances(newData);
    }
    int sourceIdIndex = sourceIdAttribute.index();

    Add idAdder = new Add();
    idAdder.setAttributeName(Constants.ID_FEATURE_NAME);
    // Multi-label data gets the id appended at the end, single-label data at the front.
    idAdder.setAttributeIndex(isMultilabel ? "last" : "first");
    idAdder.setAttributeType(new SelectedTag(Attribute.STRING, Add.TAGS_TYPE));
    idAdder.setInputFormat(newData);
    Instances withId = Filter.useFilter(newData, idAdder);

    // Position of the freshly added attribute, matching the "last"/"first" choice above.
    int targetIdIndex = isMultilabel ? withId.numAttributes() - 1 : 0;
    int rowCount = withId.numInstances();
    for (int row = 0; row < rowCount; row++) {
        String idValue = oldData.instance(row).stringValue(sourceIdIndex);
        withId.instance(row).setValue(targetIdIndex, idValue);
    }
    return withId;
}
Also used : Instances(weka.core.Instances) MultiLabelInstances(mulan.data.MultiLabelInstances) Add(weka.filters.unsupervised.attribute.Add) SelectedTag(weka.core.SelectedTag)

Example 18 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class MekaDataWriter method initalConfiguration.

/**
 * Lazily sets up the incremental ARFF saver, the attribute store and the header data set
 * ({@code masterInstance}). The work is done only on the first call; once {@code saver} is
 * set, the existing {@code masterInstance} is returned unchanged.
 *
 * Feature attributes are read from the feature description file in {@code outputFolder}.
 * Each line is tab-separated: feature name, feature type, and (for nominal features) a third
 * field that is handed to the attribute encoder.
 *
 * @param instances
 *            the instances about to be written; only their count is used here, as the
 *            initial capacity of the header data set
 * @return the header data set that written instances are attached to
 * @throws TextClassificationException
 *             declared for failures while building attributes
 * @throws IOException
 *             if the feature description file cannot be read or the saver cannot be set up
 */
private Instances initalConfiguration(Collection<Instance> instances) throws TextClassificationException, IOException {
    // Already initialised by an earlier call.
    if (saver != null) {
        return masterInstance;
    }

    saver = new ArffSaver();
    saver.setRetrieval(Saver.INCREMENTAL);
    saver.setFile(arffTarget);
    saver.setCompressOutput(false);

    attributeStore = new AttributeStore();
    File descriptionFile = new File(outputFolder, Constants.FILENAME_FEATURES_DESCRIPTION);
    for (String line : FileUtils.readLines(descriptionFile, "utf-8")) {
        String[] fields = line.split("\t");
        String name = fields[0];
        if (attributeStore.containsAttributeName(name)) {
            // The first description seen for a feature name wins.
            continue;
        }
        FeatureType featureType = FeatureType.valueOf(fields[1]);
        // Only nominal features read the third field.
        String enumType = (featureType == FeatureType.NOMINAL) ? fields[2] : null;
        Attribute featureAttribute = WekaFeatureEncoder.featureToAttributeUsingFeatureDescription(name, featureType, enumType);
        attributeStore.addAttribute(name, featureAttribute);
    }

    // Build one attribute per outcome label.
    outcomeAttributes = createOutcomeAttributes(Arrays.asList(outcomes));
    // in Meka, class label attributes have to go on top
    for (Attribute labelAttribute : outcomeAttributes) {
        attributeStore.addAttributeAtBegin(labelAttribute.name(), labelAttribute);
    }

    // for Meka-internal use: the relation name carries the number of label attributes
    String relationName = WekaUtils.RELATION_NAME + ": -C " + outcomeAttributes.size() + " ";
    masterInstance = new Instances(relationName, attributeStore.getAttributes(), instances.size());
    masterInstance.setClassIndex(outcomeAttributes.size());
    saver.setInstances(masterInstance);
    return masterInstance;
}
Also used : AttributeStore(org.dkpro.tc.ml.weka.util.AttributeStore) Instances(weka.core.Instances) FeatureType(org.dkpro.tc.api.features.FeatureType) Attribute(weka.core.Attribute) ArffSaver(weka.core.converters.ArffSaver) File(java.io.File)

Example 19 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class MekaDataWriter method writeClassifierFormat.

/**
 * Converts each given instance into a Weka instance and writes it incrementally through
 * {@code saver}. Label attributes are filled with 1.0 or 0.0 depending on whether the
 * instance's outcomes contain the label, where the label is the part of the attribute name
 * that follows {@code CLASS_ATTRIBUTE_PREFIX}.
 *
 * @param instances
 *            the instances to write
 * @throws AnalysisEngineProcessException
 *             wrapping any exception raised during configuration, conversion or writing
 */
@Override
public void writeClassifierFormat(Collection<Instance> instances) throws AnalysisEngineProcessException {
    try {
        Instances header = initalConfiguration(instances);
        for (Instance tcInstance : instances) {
            double[] values = getFeatureValues(attributeStore, tcInstance);

            // set class label values
            List<String> assignedOutcomes = tcInstance.getOutcomes();
            for (Attribute labelAttribute : outcomeAttributes) {
                String attributeName = labelAttribute.name();
                int slot = attributeStore.getAttributeOffset(attributeName);
                boolean labelPresent = assignedOutcomes.contains(attributeName.split(CLASS_ATTRIBUTE_PREFIX)[1]);
                values[slot] = labelPresent ? 1.0d : 0.0d;
            }

            weka.core.Instance row = useSparse
                    ? new SparseInstance(1.0, values)
                    : new DenseInstance(1.0, values);
            row.setDataset(header);

            // The default weight of 1.0 is only overridden when weighting is enabled.
            Double weight = tcInstance.getWeight();
            if (applyWeighting) {
                row.setWeight(weight);
            }
            saver.writeIncremental(row);
        }
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
}
Also used : DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) Instance(org.dkpro.tc.api.features.Instance) Attribute(weka.core.Attribute) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) IOException(java.io.IOException) TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) Instances(weka.core.Instances)

Example 20 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaDataWriter method initalConfiguration.

/**
 * Lazily sets up the incremental ARFF saver, the attribute store and the header data set
 * ({@code masterInstance}). The work is done only on the first call; once {@code saver} is
 * set, the existing {@code masterInstance} is returned unchanged.
 *
 * Feature attributes are read from the feature description file in {@code outputFolder}.
 * Each line is tab-separated: feature name, feature type, and (for nominal features) a third
 * field that is handed to the attribute encoder. The outcome attribute is added after all
 * feature attributes and becomes the class attribute of the header.
 *
 * @param instances
 *            the instances about to be written; only their count is used here, as the
 *            initial capacity of the header data set
 * @return the header data set that written instances are attached to
 * @throws TextClassificationException
 *             declared for failures while building attributes
 * @throws IOException
 *             if the feature description file cannot be read or the saver cannot be set up
 */
private Instances initalConfiguration(Collection<Instance> instances) throws TextClassificationException, IOException {
    // Already initialised by an earlier call.
    if (saver != null) {
        return masterInstance;
    }

    saver = new ArffSaver();
    saver.setRetrieval(Saver.INCREMENTAL);
    saver.setFile(arffTarget);
    saver.setCompressOutput(false);

    attributeStore = new AttributeStore();
    File descriptionFile = new File(outputFolder, Constants.FILENAME_FEATURES_DESCRIPTION);
    for (String line : FileUtils.readLines(descriptionFile, "utf-8")) {
        String[] fields = line.split("\t");
        String name = fields[0];
        if (attributeStore.containsAttributeName(name)) {
            // The first description seen for a feature name wins.
            continue;
        }
        FeatureType featureType = FeatureType.valueOf(fields[1]);
        // Only nominal features read the third field.
        String enumType = (featureType == FeatureType.NOMINAL) ? fields[2] : null;
        Attribute featureAttribute = WekaFeatureEncoder.featureToAttributeUsingFeatureDescription(name, featureType, enumType);
        attributeStore.addAttribute(name, featureAttribute);
    }

    outcomeAttribute = createOutcomeAttribute(Arrays.asList(outcomes), isRegression);
    // Make sure the outcome attribute does not clash with a feature of the same name
    boolean nameClash = attributeStore.containsAttributeName(CLASS_ATTRIBUTE_NAME);
    if (nameClash) {
        System.err.println("A feature with name \"outcome\" was found. Renaming outcome attribute");
        outcomeAttribute = outcomeAttribute.copy(CLASS_ATTRIBUTE_PREFIX + CLASS_ATTRIBUTE_NAME);
    }
    attributeStore.addAttribute(outcomeAttribute.name(), outcomeAttribute);

    masterInstance = new Instances(WekaUtils.RELATION_NAME, attributeStore.getAttributes(), instances.size());
    masterInstance.setClass(outcomeAttribute);
    saver.setInstances(masterInstance);
    return masterInstance;
}
Also used : AttributeStore(org.dkpro.tc.ml.weka.util.AttributeStore) Instances(weka.core.Instances) FeatureType(org.dkpro.tc.api.features.FeatureType) Attribute(weka.core.Attribute) ArffSaver(weka.core.converters.ArffSaver) File(java.io.File)

Aggregations

Instances (weka.core.Instances)31 Attribute (weka.core.Attribute)12 ArrayList (java.util.ArrayList)9 File (java.io.File)8 Instance (org.dkpro.tc.api.features.Instance)8 Test (org.junit.Test)8 MultiLabelInstances (mulan.data.MultiLabelInstances)7 IOException (java.io.IOException)5 DenseInstance (weka.core.DenseInstance)5 Instance (weka.core.Instance)5 ArffSaver (weka.core.converters.ArffSaver)5 Feature (org.dkpro.tc.api.features.Feature)4 Classifier (weka.classifiers.Classifier)3 FastVector (weka.core.FastVector)3 SparseInstance (weka.core.SparseInstance)3 HashMap (java.util.HashMap)2 Result (meka.core.Result)2 AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException)2 TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException)2 FeatureType (org.dkpro.tc.api.features.FeatureType)2