Search in sources :

Example 6 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaUtils method instanceListToArffFile.

/**
 * Writes a list of instances to an ARFF file using Weka's incremental saver. Single-label case.
 * <p>
 * The attribute set is derived from the features of all instances plus one outcome attribute.
 * If a feature is already registered under the outcome attribute's name, the outcome attribute
 * is renamed with a prefix so both can coexist. Compression is enabled on the saver.
 *
 * @param outputFile
 *            the output file; missing parent directories are created
 * @param instanceList
 *            the instance list; must not be empty
 * @param useDenseInstances
 *            if true, each row is written as a dense instance, otherwise as a sparse instance
 * @param isRegressionExperiment
 *            if true, outcomes are parsed as doubles; otherwise they are set as string labels
 * @param useWeights
 *            if true, the weight of each instance is copied to the written instance
 * @throws IllegalArgumentException
 *             if the instance list is empty
 * @throws Exception
 *             in case of error
 */
public static void instanceListToArffFile(File outputFile, List<Instance> instanceList, boolean useDenseInstances, boolean isRegressionExperiment, boolean useWeights) throws Exception {
    List<String> outcomeList = new ArrayList<>();
    for (Instance i : instanceList) {
        outcomeList.add(i.getOutcome());
    }
    // check for error conditions
    if (outcomeList.isEmpty()) {
        throw new IllegalArgumentException("List of instance outcomes is empty.");
    }
    AttributeStore attributeStore = WekaFeatureEncoder.getAttributeStore(instanceList);
    // Make sure "outcome" is not the name of an attribute
    Attribute outcomeAttribute = createOutcomeAttribute(outcomeList, isRegressionExperiment);
    if (attributeStore.containsAttributeName(CLASS_ATTRIBUTE_NAME)) {
        System.err.println("A feature with name \"outcome\" was found. Renaming outcome attribute");
        outcomeAttribute = outcomeAttribute.copy(CLASS_ATTRIBUTE_PREFIX + CLASS_ATTRIBUTE_NAME);
    }
    attributeStore.addAttribute(outcomeAttribute.name(), outcomeAttribute);
    Instances wekaInstances = new Instances(RELATION_NAME, attributeStore.getAttributes(), instanceList.size());
    wekaInstances.setClass(outcomeAttribute);
    if (!outputFile.exists()) {
        // Create the missing PARENT directories only. Calling mkdirs() on the output file
        // itself would create a directory at the file's own path.
        File parentDir = outputFile.getAbsoluteFile().getParentFile();
        if (parentDir != null) {
            parentDir.mkdirs();
        }
        outputFile.createNewFile();
    }
    ArffSaver saver = new ArffSaver();
    // rows are streamed one by one instead of being collected in memory first
    saver.setRetrieval(Saver.INCREMENTAL);
    saver.setFile(outputFile);
    saver.setCompressOutput(true);
    // wekaInstances holds no rows; it only supplies the header structure
    saver.setInstances(wekaInstances);
    for (int i = 0; i < instanceList.size(); i++) {
        Instance instance = instanceList.get(i);
        double[] featureValues = getFeatureValues(attributeStore, instance);
        weka.core.Instance wekaInstance;
        if (useDenseInstances) {
            wekaInstance = new DenseInstance(1.0, featureValues);
        } else {
            wekaInstance = new SparseInstance(1.0, featureValues);
        }
        // the instance needs a dataset reference before its class value can be set
        wekaInstance.setDataset(wekaInstances);
        String outcome = outcomeList.get(i);
        if (isRegressionExperiment) {
            wekaInstance.setClassValue(Double.parseDouble(outcome));
        } else {
            wekaInstance.setClassValue(outcome);
        }
        if (useWeights) {
            wekaInstance.setWeight(instance.getWeight());
        }
        saver.writeIncremental(wekaInstance);
    }
    // finishes the incremental saving process
    saver.writeIncremental(null);
}
Also used : DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) Instance(org.dkpro.tc.api.features.Instance) Attribute(weka.core.Attribute) ArrayList(java.util.ArrayList) ArffSaver(weka.core.converters.ArffSaver) Instances(weka.core.Instances) MultiLabelInstances(mulan.data.MultiLabelInstances)

Example 7 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaUtils method applyAttributeSelectionFilter.

/**
 * Runs the given remove filter over the training data and then rearranges the attributes so
 * the result can be used within Meka. The "-C" argument in the relation name is rewritten to
 * the class index of the rearranged data.
 *
 * @param trainData
 *            the train data
 * @param removeFilter
 *            the remove filter to apply; if {@code null}, the train data is returned unchanged
 * @return the filtered and rearranged instances, or {@code trainData} itself when no filter
 *         was given
 * @throws Exception
 *             in case of error
 */
public static Instances applyAttributeSelectionFilter(Instances trainData, Remove removeFilter) throws Exception {
    if (removeFilter == null) {
        // no filter supplied => nothing to reduce, hand the data back untouched
        return trainData;
    }

    Instances reduced = Filter.useFilter(trainData, removeFilter);
    reduced.setClassIndex(trainData.classIndex());

    // swap attributes to fit MEKA: select the trailing attributes as class attributes
    int firstSelectedPosition = reduced.numAttributes() - trainData.classIndex() + 1;
    MekaClassAttributes classAttributeMover = new MekaClassAttributes();
    classAttributeMover.setAttributeIndices(firstSelectedPosition + "-last");
    classAttributeMover.setInputFormat(reduced);
    Instances rearranged = Filter.useFilter(reduced, classAttributeMover);

    // keep the "-C <n>" argument of the relation name in sync with the new class index
    String updatedClassArgument = "-C " + rearranged.classIndex();
    String currentRelationName = rearranged.relationName();
    rearranged.setRelationName(currentRelationName.replaceAll("\\-C\\s[\\d]+", updatedClassArgument));
    return rearranged;
}
Also used : Instances(weka.core.Instances) MultiLabelInstances(mulan.data.MultiLabelInstances) MekaClassAttributes(meka.filters.unsupervised.attribute.MekaClassAttributes)

Example 8 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaUtils method getInstances.

/**
 * Read instances from uncompressed or compressed arff files. Compression is determined by
 * filename suffix (".gz" or ".bz2"). For bz2 files, it is expected that the first two bytes
 * mark the compression type (BZ) - thus, the first two bytes of the stream are skipped. <br>
 * For arff files with single-label outcome, the class attribute is expected at the end of the
 * attribute set. For arff files with multi-label outcome, the class attribute is expected at
 * the beginning of the attribute set; additionally the number of class labels must be specified
 * in the relation tag behind a "-C" argument, e.g. "-C 3".
 * <p>
 * The file is read as UTF-8. All streams opened here are closed before returning, also when
 * reading or parsing fails.
 *
 * @param instancesFile
 *            arff File
 * @param multiLabel
 *            whether this arff file contains single- or multi-label outcome
 * @return instances with class attribute set
 * @throws FileNotFoundException
 *             if file is not found
 * @throws IOException
 *             if an exception occurs
 * @throws IllegalStateException
 *             if {@code multiLabel} is set and the relation name contains no "-C" argument
 */
public static Instances getInstances(File instancesFile, boolean multiLabel) throws FileNotFoundException, IOException {
    // The outer resource guarantees the file handle is released even if building the
    // decompressing stream fails; closing it a second time via the reader is harmless.
    try (FileInputStream fis = new FileInputStream(instancesFile)) {
        BufferedInputStream bufStr = new BufferedInputStream(fis);
        String fileName = instancesFile.getName();
        InputStream underlyingStream;
        if (fileName.endsWith(".gz")) {
            underlyingStream = new GZIPInputStream(bufStr);
        } else if (fileName.endsWith(".bz2")) {
            // skip bzip2 prefix that we added manually; read through the buffered stream so
            // the skipped bytes and the decompressor see one consistent stream position
            bufStr.read();
            bufStr.read();
            underlyingStream = new CBZip2InputStream(bufStr);
        } else {
            underlyingStream = bufStr;
        }
        try (Reader reader = new InputStreamReader(underlyingStream, "UTF-8")) {
            Instances trainData = new Instances(reader);
            if (multiLabel) {
                String relationTag = trainData.relationName();
                // for multi-label classification, class labels are expected at beginning of
                // attribute set and their number must be specified with the -C parameter in
                // the relation tag
                Matcher m = Pattern.compile("-C\\s(\\d+)").matcher(relationTag);
                if (!m.find()) {
                    throw new IllegalStateException("Relation name \"" + relationTag + "\" of file " + instancesFile + " contains no \"-C <number>\" argument, which is required for multi-label data.");
                }
                // capture group instead of split("-C "): works for any whitespace the
                // pattern accepts, not only a single space
                trainData.setClassIndex(Integer.parseInt(m.group(1)));
            } else {
                // for single-label classification, class label expected as last attribute
                trainData.setClassIndex(trainData.numAttributes() - 1);
            }
            return trainData;
        }
    }
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) Instances(weka.core.Instances) MultiLabelInstances(mulan.data.MultiLabelInstances) InputStreamReader(java.io.InputStreamReader) BufferedInputStream(java.io.BufferedInputStream) Matcher(java.util.regex.Matcher) GZIPInputStream(java.util.zip.GZIPInputStream) BufferedInputStream(java.io.BufferedInputStream) ObjectInputStream(java.io.ObjectInputStream) CBZip2InputStream(org.apache.tools.bzip2.CBZip2InputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) CBZip2InputStream(org.apache.tools.bzip2.CBZip2InputStream) Reader(java.io.Reader) InputStreamReader(java.io.InputStreamReader) FileInputStream(java.io.FileInputStream)

Example 9 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaUtils method makeOutcomeClassesCompatible.

/**
 * Adapts the test data class labels to the training data. Class labels from the test data
 * unseen in the training data will be deleted from the test data. Class labels from the
 * training data unseen in the test data will be added to the test data. If training and test
 * class labels are equal, nothing will be done.
 * <p>
 * Single-label: a new nominal class attribute carrying the training labels is added to the
 * test data, each instance's label is copied over when the training data knows it, and the old
 * class attribute is removed. <br>
 * Multi-label: one new "0,1" attribute per training label is added, the values of test labels
 * that also occur in the training labels are copied over, and the old label attributes are
 * removed from the front of the attribute set.
 *
 * @param trainData
 *            train data
 * @param testData
 *            test data
 * @param multilabel
 *            is multilabel
 * @return the compatible test data with its class attribute / class index set
 * @throws Exception
 *             in case of error
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public static Instances makeOutcomeClassesCompatible(Instances trainData, Instances testData, boolean multilabel) throws Exception {
    // new (compatible) test data
    Instances compTestData = null;
    // ================ SINGLE LABEL BRANCH ======================
    if (!multilabel) {
        // retrieve class labels
        Enumeration trainOutcomeValues = trainData.classAttribute().enumerateValues();
        Enumeration testOutcomeValues = testData.classAttribute().enumerateValues();
        ArrayList trainLabels = Collections.list(trainOutcomeValues);
        ArrayList testLabels = Collections.list(testOutcomeValues);
        // add new outcome class attribute to test data
        Add addFilter = new Add();
        // the label set is handed over as one comma-joined string
        // NOTE(review): a training label that itself contains a comma would presumably be
        // split into several labels here - confirm against the Add filter documentation
        addFilter.setNominalLabels(StringUtils.join(trainLabels, ','));
        addFilter.setAttributeName(Constants.CLASS_ATTRIBUTE_NAME + COMPATIBLE_OUTCOME_CLASS);
        addFilter.setInputFormat(testData);
        testData = Filter.useFilter(testData, addFilter);
        // fill NEW test data with values from old test data plus the new class attribute
        // (compTestData starts out with the header of the filtered test data)
        compTestData = new Instances(testData, testData.numInstances());
        for (int i = 0; i < testData.numInstances(); i++) {
            weka.core.Instance instance = testData.instance(i);
            // the stored class value is used as an index into the list of test labels
            String label = (String) testLabels.get((int) instance.value(testData.classAttribute()));
            if (trainLabels.indexOf(label) != -1) {
                // label is known to the training data: copy it to the new class attribute
                instance.setValue(testData.attribute(Constants.CLASS_ATTRIBUTE_NAME + COMPATIBLE_OUTCOME_CLASS), label);
            } else {
                // label unseen in training: the value at the test data's class index is
                // marked missing
                // NOTE(review): this addresses testData.classIndex(), not the newly added
                // attribute - presumably the new attribute simply stays unset; confirm
                instance.setMissing(testData.classIndex());
            }
            compTestData.add(instance);
        }
        // remove old class attribute
        Remove remove = new Remove();
        // index() is 0-based while the filter is given index + 1
        remove.setAttributeIndices(Integer.toString(compTestData.attribute(Constants.CLASS_ATTRIBUTE_NAME).index() + 1));
        remove.setInvertSelection(false);
        remove.setInputFormat(compTestData);
        compTestData = Filter.useFilter(compTestData, remove);
        // set new class attribute
        compTestData.setClass(compTestData.attribute(Constants.CLASS_ATTRIBUTE_NAME + COMPATIBLE_OUTCOME_CLASS));
    } else // ================ MULTI LABEL BRANCH ======================
    {
        // in this branch the class index is used as the number of label attributes, which
        // occupy the first positions of the attribute set
        int numTrainLabels = trainData.classIndex();
        int numTestLabels = testData.classIndex();
        // NOTE(review): getLabels is not visible here; the code below assumes it returns the
        // label attribute names in attribute order - verify
        ArrayList<String> trainLabels = getLabels(trainData);
        // ArrayList<String> testLabels = getLabels(testData);
        // add new outcome class attributes to test data
        // (one filter pass per training label; the same Add instance is reconfigured each time)
        Add filter = new Add();
        for (int i = 0; i < numTrainLabels; i++) {
            // numTestLabels +i (because index starts from 0)
            filter.setAttributeIndex(Integer.toString(numTestLabels + i + 1));
            filter.setNominalLabels("0,1");
            filter.setAttributeName(trainData.attribute(i).name() + COMPATIBLE_OUTCOME_CLASS);
            filter.setInputFormat(testData);
            testData = Filter.useFilter(testData, filter);
        }
        // fill NEW test data with values from old test data plus the new class attributes
        compTestData = new Instances(testData, testData.numInstances());
        for (int i = 0; i < testData.numInstances(); i++) {
            weka.core.Instance instance = testData.instance(i);
            // fill the new label positions with 0.
            for (int j = 0; j < numTrainLabels; j++) {
                instance.setValue(j + numTestLabels, 0.);
            }
            // fill the real values:
            for (int j = 0; j < numTestLabels; j++) {
                // part of train data: forget labels which are not part of the train data
                if (trainLabels.indexOf(instance.attribute(j).name()) != -1) {
                    // class label found in test data
                    // its position within the train labels selects the new attribute to fill
                    int index = trainLabels.indexOf(instance.attribute(j).name());
                    instance.setValue(index + numTestLabels, instance.value(j));
                }
            }
            compTestData.add(instance);
        }
        // remove old class attributes
        // (drops attribute "1" numTestLabels times, i.e. repeatedly the first attribute)
        for (int i = 0; i < numTestLabels; i++) {
            Remove remove = new Remove();
            remove.setAttributeIndices("1");
            remove.setInvertSelection(false);
            remove.setInputFormat(compTestData);
            compTestData = Filter.useFilter(compTestData, remove);
        }
        // adapt header and set new class label
        // the relation name is cut right after "-C" and the number of train labels appended
        // NOTE(review): any text following the old "-C" argument is dropped, and a relation
        // name without "-C" (indexOf == -1) keeps only its first character - confirm intended
        String relationTag = compTestData.relationName();
        compTestData.setRelationName(relationTag.substring(0, relationTag.indexOf("-C") + 2) + " " + numTrainLabels + " ");
        compTestData.setClassIndex(numTrainLabels);
    }
    return compTestData;
}
Also used : Instances(weka.core.Instances) MultiLabelInstances(mulan.data.MultiLabelInstances) Add(weka.filters.unsupervised.attribute.Add) Enumeration(java.util.Enumeration) ArrayList(java.util.ArrayList) Remove(weka.filters.unsupervised.attribute.Remove)

Example 10 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaResultsTest method testWekaResultsSingleLabel.

/**
 * Trains an SMO classifier on the single-label training data and checks the number of
 * correctly classified test instances reported by the evaluation.
 */
@Test
public void testWekaResultsSingleLabel() throws Exception {
    // align the test labels with the training labels, then pass both sets through
    // removeInstanceId
    Instances compatibleTestData = WekaUtils.makeOutcomeClassesCompatible(singleLabelTrainData, singleLabelTestData, false);
    Instances preparedTrainData = WekaUtils.removeInstanceId(singleLabelTrainData, false);
    Instances preparedTestData = WekaUtils.removeInstanceId(compatibleTestData, false);

    SMO classifier = new SMO();
    classifier.buildClassifier(preparedTrainData);

    Evaluation evaluation = WekaUtils.getEvaluationSinglelabel(classifier, preparedTrainData, preparedTestData);
    double correctlyClassified = evaluation.correct();
    assertEquals(7.0, correctlyClassified, 0.01);
}
Also used : Instances(weka.core.Instances) Evaluation(weka.classifiers.Evaluation) SMO(weka.classifiers.functions.SMO) Test(org.junit.Test)

Aggregations

Instances (weka.core.Instances)31 Attribute (weka.core.Attribute)12 ArrayList (java.util.ArrayList)9 File (java.io.File)8 Instance (org.dkpro.tc.api.features.Instance)8 Test (org.junit.Test)8 MultiLabelInstances (mulan.data.MultiLabelInstances)7 IOException (java.io.IOException)5 DenseInstance (weka.core.DenseInstance)5 Instance (weka.core.Instance)5 ArffSaver (weka.core.converters.ArffSaver)5 Feature (org.dkpro.tc.api.features.Feature)4 Classifier (weka.classifiers.Classifier)3 FastVector (weka.core.FastVector)3 SparseInstance (weka.core.SparseInstance)3 HashMap (java.util.HashMap)2 Result (meka.core.Result)2 AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException)2 TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException)2 FeatureType (org.dkpro.tc.api.features.FeatureType)2