Search in sources :

Example 51 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class WekaUtils method instanceListToArffFileMultiLabel.

/**
 * Writes the given instances to an ARFF file in the multi-label layout, i.e. with one
 * attribute per label placed in front of the feature attributes.
 * <p>
 * Each label attribute of an instance is set to {@code 1.0} if the instance carries that label
 * and to {@code 0.0} otherwise. The instances are written incrementally through an
 * {@link ArffSaver} with compressed output enabled.
 *
 * @param outputFile
 *            the file to write; missing parent directories and the file itself are created
 * @param instances
 *            the instances to convert
 * @param useDenseInstances
 *            if {@code true} a {@link DenseInstance} is written per instance, otherwise a
 *            {@link SparseInstance}
 * @param useWeights
 *            if {@code true} the weight of each instance is copied to the written instance
 * @throws Exception
 *             in case of errors
 */
public static void instanceListToArffFileMultiLabel(File outputFile, List<Instance> instances, boolean useDenseInstances, boolean useWeights) throws Exception {
    AttributeStore attributeStore = WekaFeatureEncoder.getAttributeStore(instances);

    // NOTE(review): only getOutcome() is collected per instance although the label values
    // below are checked against getOutcomes() - confirm that no label can be missed this way.
    List<String> outcomes = new ArrayList<>();
    for (Instance i : instances) {
        outcomes.add(i.getOutcome());
    }
    List<Attribute> outcomeAttributes = createOutcomeAttributes(new ArrayList<String>(outcomes));

    // in Meka, class label attributes have to go on top
    for (Attribute attribute : outcomeAttributes) {
        attributeStore.addAttributeAtBegin(attribute.name(), attribute);
    }

    // for Meka-internal use: the relation name carries the number of label attributes
    Instances wekaInstances = new Instances(RELATION_NAME + ": -C " + outcomeAttributes.size() + " ", attributeStore.getAttributes(), instances.size());
    wekaInstances.setClassIndex(outcomeAttributes.size());

    if (!outputFile.exists()) {
        // Create the parent directories only. Calling mkdirs() on the output file itself would
        // create a directory at the file's path and prevent the file from being created.
        File parentDir = outputFile.getParentFile();
        if (parentDir != null) {
            parentDir.mkdirs();
        }
        outputFile.createNewFile();
    }

    ArffSaver saver = new ArffSaver();
    saver.setRetrieval(Saver.INCREMENTAL);
    saver.setFile(outputFile);
    saver.setCompressOutput(true);
    saver.setInstances(wekaInstances);

    for (Instance instance : instances) {
        double[] featureValues = getFeatureValues(attributeStore, instance);

        // set class label values
        List<String> instanceOutcome = instance.getOutcomes();
        for (Attribute label : outcomeAttributes) {
            String labelname = label.name();
            // the attribute name is the label with CLASS_ATTRIBUTE_PREFIX in front of it
            featureValues[attributeStore.getAttributeOffset(labelname)] = instanceOutcome.contains(labelname.split(CLASS_ATTRIBUTE_PREFIX)[1]) ? 1.0d : 0.0d;
        }

        weka.core.Instance wekaInstance;
        if (useDenseInstances) {
            wekaInstance = new DenseInstance(1.0, featureValues);
        } else {
            wekaInstance = new SparseInstance(1.0, featureValues);
        }
        wekaInstance.setDataset(wekaInstances);

        if (useWeights) {
            wekaInstance.setWeight(instance.getWeight());
        }
        saver.writeIncremental(wekaInstance);
    }

    // finishes the incremental saving process
    saver.writeIncremental(null);
}
Also used : DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) Instance(org.dkpro.tc.api.features.Instance) Attribute(weka.core.Attribute) ArrayList(java.util.ArrayList) ArffSaver(weka.core.converters.ArffSaver) Instances(weka.core.Instances) MultiLabelInstances(mulan.data.MultiLabelInstances)

Example 52 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class MekaDataWriter method transformFromGeneric.

/**
 * Reads the generic feature file line by line, restores the instances stored as a JSON array
 * on each line and passes them to {@code writeClassifierFormat}. The generic feature file is
 * deleted once all lines have been processed.
 *
 * @throws Exception
 *             if the file cannot be read or the instances cannot be written
 */
@Override
public void transformFromGeneric() throws Exception {
    File genericFeatureFile = new File(outputFolder, GENERIC_FEATURE_FILE);
    // try-with-resources closes the reader even if parsing or writing a line fails
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(genericFeatureFile), "utf-8"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            Instance[] restoredInstances = gson.fromJson(line, Instance[].class);
            writeClassifierFormat(Arrays.asList(restoredInstances));
        }
    }
    // only reached when every line was processed successfully
    FileUtils.deleteQuietly(genericFeatureFile);
}
Also used : InputStreamReader(java.io.InputStreamReader) DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) Instance(org.dkpro.tc.api.features.Instance) BufferedReader(java.io.BufferedReader) File(java.io.File) FileInputStream(java.io.FileInputStream)

Example 53 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class MekaDataWriter method writeClassifierFormat.

/**
 * Writes the given instances incrementally in the multi-label layout: every label attribute is
 * set to 1.0 if the instance carries the label and to 0.0 otherwise.
 *
 * @param instances
 *            the instances to write
 * @throws AnalysisEngineProcessException
 *             wraps any exception raised while converting or writing
 */
@Override
public void writeClassifierFormat(Collection<Instance> instances) throws AnalysisEngineProcessException {
    try {
        Instances dataset = initalConfiguration(instances);
        for (Instance current : instances) {
            double[] values = getFeatureValues(attributeStore, current);

            // mark each label attribute as present (1.0) or absent (0.0) for this instance
            List<String> assignedLabels = current.getOutcomes();
            for (Attribute labelAttribute : outcomeAttributes) {
                String attributeName = labelAttribute.name();
                int offset = attributeStore.getAttributeOffset(attributeName);
                // the attribute name is the label with CLASS_ATTRIBUTE_PREFIX in front of it
                String plainLabel = attributeName.split(CLASS_ATTRIBUTE_PREFIX)[1];
                if (assignedLabels.contains(plainLabel)) {
                    values[offset] = 1.0d;
                } else {
                    values[offset] = 0.0d;
                }
            }

            weka.core.Instance converted = useSparse
                    ? new SparseInstance(1.0, values)
                    : new DenseInstance(1.0, values);
            converted.setDataset(dataset);

            Double weight = current.getWeight();
            if (applyWeighting) {
                converted.setWeight(weight);
            }
            saver.writeIncremental(converted);
        }
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
}
Also used : DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) Instance(org.dkpro.tc.api.features.Instance) Attribute(weka.core.Attribute) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) IOException(java.io.IOException) TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) Instances(weka.core.Instances)

Example 54 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class WekaDataWriter method writeClassifierFormat.

/**
 * Writes the given instances incrementally in the single-outcome layout. The outcome is set as
 * the class value, parsed as a number when {@code isRegression} is set and used as a string
 * otherwise.
 *
 * @param instances
 *            the instances to write
 * @throws AnalysisEngineProcessException
 *             wraps any exception raised while converting or writing
 */
@Override
public void writeClassifierFormat(Collection<Instance> instances) throws AnalysisEngineProcessException {
    try {
        Instances dataset = initalConfiguration(instances);
        for (Instance current : instances) {
            double[] values = getFeatureValues(attributeStore, current);

            weka.core.Instance converted = useSparse
                    ? new SparseInstance(1.0, values)
                    : new DenseInstance(1.0, values);
            converted.setDataset(dataset);

            String outcome = current.getOutcome();
            if (!isRegression) {
                converted.setClassValue(outcome);
            } else {
                // regression outcomes are numeric class values
                converted.setClassValue(Double.parseDouble(outcome));
            }

            Double weight = current.getWeight();
            if (applyWeighting) {
                converted.setWeight(weight);
            }
            saver.writeIncremental(converted);
        }
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
}
Also used : Instances(weka.core.Instances) DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) Instance(org.dkpro.tc.api.features.Instance) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) IOException(java.io.IOException) TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException)

Example 55 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class WekaDataWriter method transformFromGeneric.

/**
 * Reads the generic feature file line by line, restores the instances stored as a JSON array
 * on each line and passes them to {@code writeClassifierFormat}. The generic feature file is
 * deleted once all lines have been processed.
 *
 * @throws Exception
 *             if the file cannot be read or the instances cannot be written
 */
@Override
public void transformFromGeneric() throws Exception {
    File genericFeatureFile = new File(outputFolder, GENERIC_FEATURE_FILE);
    // try-with-resources closes the reader even if parsing or writing a line fails
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(genericFeatureFile), "utf-8"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            Instance[] restoredInstances = gson.fromJson(line, Instance[].class);
            writeClassifierFormat(Arrays.asList(restoredInstances));
        }
    }
    // only reached when every line was processed successfully
    FileUtils.deleteQuietly(genericFeatureFile);
}
Also used : InputStreamReader(java.io.InputStreamReader) DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) Instance(org.dkpro.tc.api.features.Instance) BufferedReader(java.io.BufferedReader) File(java.io.File) FileInputStream(java.io.FileInputStream)

Aggregations

Instance (org.dkpro.tc.api.features.Instance): 61; ArrayList (java.util.ArrayList): 38; Feature (org.dkpro.tc.api.features.Feature): 30; Test (org.junit.Test): 27; File (java.io.File): 17; AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException): 14; IOException (java.io.IOException): 12; Gson (com.google.gson.Gson): 8; TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException): 8; Attribute (weka.core.Attribute): 8; DenseInstance (weka.core.DenseInstance): 8; Instances (weka.core.Instances): 8; SparseInstance (weka.core.SparseInstance): 8; FeatureExtractorResource_ImplBase (org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase): 6; BufferedReader (java.io.BufferedReader): 5; FileInputStream (java.io.FileInputStream): 5; InputStreamReader (java.io.InputStreamReader): 5; AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription): 5; CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription): 5; ExternalResourceDescription (org.apache.uima.resource.ExternalResourceDescription): 5