Search in sources :

Example 1 with Remove

use of weka.filters.unsupervised.attribute.Remove in project dkpro-tc by dkpro.

In class WekaUtils, method makeOutcomeClassesCompatible.

/**
 * Builds a copy of the test data whose outcome (class) attributes are compatible with the
 * training data.
 * <p>
 * Single-label mode: a new nominal class attribute holding the training labels is added to the
 * test data. A test instance whose label also occurs in the training labels keeps that label;
 * an instance whose label is unknown to the training data, or whose class value is missing,
 * gets a missing value in the new class attribute. The old class attribute is then removed and
 * the new one becomes the class attribute.
 * <p>
 * Multi-label mode: one binary ("0,1") attribute per training label is added, initialised with
 * 0 and filled from the test label attribute of the same name, if any. Test label attributes
 * whose name is not among the training labels are dropped. The old label attributes are removed
 * and the relation name and class index are adapted to the number of training labels.
 *
 * @param trainData
 *            train data
 * @param testData
 *            test data
 * @param multilabel
 *            whether the data is multi-label
 * @return the test data with outcome attributes compatible with the training data
 * @throws Exception
 *             in case of error
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public static Instances makeOutcomeClassesCompatible(Instances trainData, Instances testData, boolean multilabel) throws Exception {
    // new (compatible) test data
    Instances compTestData = null;
    if (!multilabel) {
        // ================ SINGLE LABEL BRANCH ======================
        // retrieve class labels
        ArrayList trainLabels = Collections.list(trainData.classAttribute().enumerateValues());
        ArrayList testLabels = Collections.list(testData.classAttribute().enumerateValues());
        String compatibleClassName = Constants.CLASS_ATTRIBUTE_NAME + COMPATIBLE_OUTCOME_CLASS;
        // add new outcome class attribute to test data
        Add addFilter = new Add();
        addFilter.setNominalLabels(StringUtils.join(trainLabels, ','));
        addFilter.setAttributeName(compatibleClassName);
        addFilter.setInputFormat(testData);
        testData = Filter.useFilter(testData, addFilter);
        int oldClassIndex = testData.classIndex();
        int compatibleClassIndex = testData.attribute(compatibleClassName).index();
        // fill NEW test data with values from old test data plus the new class attribute
        compTestData = new Instances(testData, testData.numInstances());
        for (int i = 0; i < testData.numInstances(); i++) {
            weka.core.Instance instance = testData.instance(i);
            String label = null;
            // A missing value is stored as NaN; casting NaN to int yields 0, which would
            // silently pick the first test label. Only look up the label if one is present.
            if (!instance.isMissing(oldClassIndex)) {
                label = (String) testLabels.get((int) instance.value(oldClassIndex));
            }
            if (label != null && trainLabels.contains(label)) {
                instance.setValue(compatibleClassIndex, label);
            } else {
                // label unknown to the training data (or absent): no compatible outcome
                instance.setMissing(compatibleClassIndex);
            }
            compTestData.add(instance);
        }
        // remove old class attribute (the Remove filter takes 1-based indices)
        Remove remove = new Remove();
        remove.setAttributeIndices(Integer.toString(compTestData.attribute(Constants.CLASS_ATTRIBUTE_NAME).index() + 1));
        remove.setInvertSelection(false);
        remove.setInputFormat(compTestData);
        compTestData = Filter.useFilter(compTestData, remove);
        // set new class attribute
        compTestData.setClass(compTestData.attribute(compatibleClassName));
    } else {
        // ================ MULTI LABEL BRANCH ======================
        // the class index is used as the number of label attributes, which are expected to
        // be the leading attributes of the data set
        int numTrainLabels = trainData.classIndex();
        int numTestLabels = testData.classIndex();
        // NOTE(review): assumes getLabels returns the label attribute names in attribute
        // order, so that a position in this list matches the attributes added below - confirm
        ArrayList<String> trainLabels = getLabels(trainData);
        // add new outcome class attributes to test data, directly behind the old labels
        Add filter = new Add();
        for (int i = 0; i < numTrainLabels; i++) {
            // numTestLabels + i + 1 because the filter takes 1-based indices
            filter.setAttributeIndex(Integer.toString(numTestLabels + i + 1));
            filter.setNominalLabels("0,1");
            filter.setAttributeName(trainData.attribute(i).name() + COMPATIBLE_OUTCOME_CLASS);
            filter.setInputFormat(testData);
            testData = Filter.useFilter(testData, filter);
        }
        // fill NEW test data with values from old test data plus the new class attributes
        compTestData = new Instances(testData, testData.numInstances());
        for (int i = 0; i < testData.numInstances(); i++) {
            weka.core.Instance instance = testData.instance(i);
            // initialise all new label attributes with 0
            for (int j = 0; j < numTrainLabels; j++) {
                instance.setValue(j + numTestLabels, 0.);
            }
            // copy the real values; test labels which are not part of the train data are ignored
            for (int j = 0; j < numTestLabels; j++) {
                int trainLabelIndex = trainLabels.indexOf(instance.attribute(j).name());
                if (trainLabelIndex != -1) {
                    instance.setValue(trainLabelIndex + numTestLabels, instance.value(j));
                }
            }
            compTestData.add(instance);
        }
        // remove old class attributes: they are the leading ones, so drop the first
        // attribute numTestLabels times
        for (int i = 0; i < numTestLabels; i++) {
            Remove remove = new Remove();
            remove.setAttributeIndices("1");
            remove.setInvertSelection(false);
            remove.setInputFormat(compTestData);
            compTestData = Filter.useFilter(compTestData, remove);
        }
        // adapt header and set new class label
        // NOTE(review): expects the relation name to contain a "-C" marker - confirm
        String relationTag = compTestData.relationName();
        compTestData.setRelationName(relationTag.substring(0, relationTag.indexOf("-C") + 2) + " " + numTrainLabels + " ");
        compTestData.setClassIndex(numTrainLabels);
    }
    return compTestData;
}
Also used : Instances(weka.core.Instances) MultiLabelInstances(mulan.data.MultiLabelInstances) Add(weka.filters.unsupervised.attribute.Add) Enumeration(java.util.Enumeration) ArrayList(java.util.ArrayList) Remove(weka.filters.unsupervised.attribute.Remove)

Example 2 with Remove

use of weka.filters.unsupervised.attribute.Remove in project dkpro-tc by dkpro.

In class WekaUtils, method removeInstanceId.

/**
 * Returns the data set without the instance id attribute.
 * <p>
 * If the data has no attribute named {@code Constants.ID_FEATURE_NAME}, a copy of the data is
 * returned. In multi-label mode the class index read from the input data is re-applied to the
 * result.
 *
 * @param data
 *            data set with or without instanceId attribute
 * @param multilabel
 *            is multi label processing
 * @return the data set without instanceId attribute
 * @throws Exception
 *             an exception
 */
public static Instances removeInstanceId(Instances data, boolean multilabel) throws Exception {
    final int originalClassIndex = data.classIndex();
    final boolean hasInstanceId = data.attribute(Constants.ID_FEATURE_NAME) != null;
    Instances result;
    if (!hasInstanceId) {
        // nothing to remove: hand back a copy
        result = new Instances(data);
    } else {
        // the Remove filter takes 1-based attribute indices
        String oneBasedIdIndex = String.valueOf(data.attribute(Constants.ID_FEATURE_NAME).index() + 1);
        Remove idRemover = new Remove();
        idRemover.setAttributeIndices(oneBasedIdIndex);
        idRemover.setInvertSelection(false);
        idRemover.setInputFormat(data);
        result = Filter.useFilter(data, idRemover);
    }
    if (multilabel) {
        // make sure the class index gets retained in multi-label
        result.setClassIndex(originalClassIndex);
    }
    return result;
}
Also used : Instances(weka.core.Instances) MultiLabelInstances(mulan.data.MultiLabelInstances) Remove(weka.filters.unsupervised.attribute.Remove)

Example 3 with Remove

use of weka.filters.unsupervised.attribute.Remove in project dkpro-tc by dkpro.

In class WekaUtils, method featureSelectionMultilabel.

/**
 * Feature selection using Mulan.
 * <p>
 * Ranks the feature attributes of the training data, writes the evaluation of all feature
 * attributes to the feature selection results file and configures a {@link Remove} filter that
 * keeps the first {@code numLabelsToKeep} entries of the ranking plus all label attributes.
 *
 * @param aContext
 *            Lab context
 * @param trainData
 *            training data
 * @param attributeEvaluator
 *            evaluator: the class name followed by its options
 * @param labelTransformationMethod
 *            transformation method, either "LabelPowersetAttributeEvaluator" or
 *            "BinaryRelevanceAttributeEvaluator"
 * @param numLabelsToKeep
 *            number of entries to take from the head of the attribute ranking
 * @return the configured remove filter, or {@code null} if {@code numLabelsToKeep} is negative
 *         or larger than the number of ranked attributes (i.e. no filtering should be applied)
 * @throws TextClassificationException
 *             if the transformation method is not supported or any other error occurs
 */
public static Remove featureSelectionMultilabel(TaskContext aContext, Instances trainData, List<String> attributeEvaluator, String labelTransformationMethod, int numLabelsToKeep) throws TextClassificationException {
    // file to hold the results of attribute selection
    File fsResultsFile = getFile(aContext, TEST_TASK_OUTPUT_KEY, WekaTestTask.featureSelectionFile, AccessMode.READWRITE);
    // filter for reducing dimension of attributes
    Remove filterRemove = new Remove();
    try {
        MultiLabelInstances mulanInstances = convertMekaInstancesToMulanInstances(trainData);
        ASEvaluation eval = ASEvaluation.forName(attributeEvaluator.get(0), attributeEvaluator.subList(1, attributeEvaluator.size()).toArray(new String[0]));
        // the evaluator is instantiated by name here; constant-first comparison so that a
        // null method is reported as unsupported instead of failing with a NullPointerException
        AttributeEvaluator attributeSelectionFilter;
        if ("LabelPowersetAttributeEvaluator".equals(labelTransformationMethod)) {
            attributeSelectionFilter = new LabelPowersetAttributeEvaluator(eval, mulanInstances);
        } else if ("BinaryRelevanceAttributeEvaluator".equals(labelTransformationMethod)) {
            attributeSelectionFilter = new BinaryRelevanceAttributeEvaluator(eval, mulanInstances, "max", "none", "rank");
        } else {
            throw new TextClassificationException("This Label Transformation Method is not supported.");
        }
        Ranker r = new Ranker();
        int[] result = r.search(attributeSelectionFilter, mulanInstances);
        int numLabels = mulanInstances.getNumLabels();
        // collect evaluation for *all* attributes and write to file
        StringBuilder evalFile = new StringBuilder();
        for (Attribute att : mulanInstances.getFeatureAttributes()) {
            // the evaluator is addressed with the attribute index shifted by the number of labels
            evalFile.append(att.name()).append(": ").append(attributeSelectionFilter.evaluateAttribute(att.index() - numLabels)).append("\n");
        }
        FileUtils.writeStringToFile(fsResultsFile, evalFile.toString(), "utf-8");
        // less attributes than we want => no filtering. Checked explicitly instead of catching
        // ArrayIndexOutOfBoundsException, which would also hide unrelated indexing errors.
        if (numLabelsToKeep < 0 || numLabelsToKeep > result.length) {
            return null;
        }
        // create a filter to reduce the dimension of the attributes:
        // keep the top-ranked attributes followed by all label attributes
        int[] toKeep = new int[numLabelsToKeep + numLabels];
        System.arraycopy(result, 0, toKeep, 0, numLabelsToKeep);
        int[] labelIndices = mulanInstances.getLabelIndices();
        System.arraycopy(labelIndices, 0, toKeep, numLabelsToKeep, numLabels);
        filterRemove.setAttributeIndicesArray(toKeep);
        // inverted selection: everything that is NOT listed in toKeep gets removed
        filterRemove.setInvertSelection(true);
        filterRemove.setInputFormat(mulanInstances.getDataSet());
    } catch (TextClassificationException e) {
        // already the declared exception type - do not wrap it a second time
        throw e;
    } catch (Exception e) {
        throw new TextClassificationException(e);
    }
    return filterRemove;
}
Also used : TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) Attribute(weka.core.Attribute) Remove(weka.filters.unsupervised.attribute.Remove) MultiLabelInstances(mulan.data.MultiLabelInstances) AttributeEvaluator(weka.attributeSelection.AttributeEvaluator) BinaryRelevanceAttributeEvaluator(mulan.dimensionalityReduction.BinaryRelevanceAttributeEvaluator) LabelPowersetAttributeEvaluator(mulan.dimensionalityReduction.LabelPowersetAttributeEvaluator) Ranker(mulan.dimensionalityReduction.Ranker) FileNotFoundException(java.io.FileNotFoundException) InvalidDataFormatException(mulan.data.InvalidDataFormatException) IOException(java.io.IOException) TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) ASEvaluation(weka.attributeSelection.ASEvaluation) BinaryRelevanceAttributeEvaluator(mulan.dimensionalityReduction.BinaryRelevanceAttributeEvaluator) LabelPowersetAttributeEvaluator(mulan.dimensionalityReduction.LabelPowersetAttributeEvaluator) File(java.io.File)

Example 4 with Remove

use of weka.filters.unsupervised.attribute.Remove in project dkpro-tc by dkpro.

In class WekaTestTask, method execute.

/**
 * Runs the test task: loads the training and test data, makes the outcome classes compatible
 * (except in regression mode), strips the instance ids, optionally performs feature selection,
 * evaluates the classifier and writes the evaluation and the predictions to the task context.
 *
 * @param aContext
 *            the task context used to locate input and output files
 * @throws Exception
 *             in case of error
 */
@Override
public void execute(TaskContext aContext) throws Exception {
    final boolean multiLabel = learningMode.equals(Constants.LM_MULTI_LABEL);

    // load train and test data
    File trainArff = WekaUtils.getFile(aContext, TEST_TASK_INPUT_KEY_TRAINING_DATA, Constants.FILENAME_DATA_IN_CLASSIFIER_FORMAT, AccessMode.READONLY);
    File testArff = WekaUtils.getFile(aContext, TEST_TASK_INPUT_KEY_TEST_DATA, Constants.FILENAME_DATA_IN_CLASSIFIER_FORMAT, AccessMode.READONLY);
    Instances trainData = WekaUtils.getInstances(trainArff, multiLabel);
    Instances testData = WekaUtils.getInstances(testArff, multiLabel);

    // do not balance in regression experiments
    if (!learningMode.equals(Constants.LM_REGRESSION)) {
        testData = WekaUtils.makeOutcomeClassesCompatible(trainData, testData, multiLabel);
    }

    // keep a copy that still carries the instance ids; it is passed to addInstanceId below
    Instances testDataWithIds = new Instances(testData);
    trainData = WekaUtils.removeInstanceId(trainData, multiLabel);
    testData = WekaUtils.removeInstanceId(testData, multiLabel);

    // FEATURE SELECTION
    if (multiLabel) {
        if (attributeEvaluator != null && labelTransformationMethod != null && numLabelsToKeep > 0) {
            Remove selectionFilter = WekaUtils.featureSelectionMultilabel(aContext, trainData, attributeEvaluator, labelTransformationMethod, numLabelsToKeep);
            if (applySelection) {
                Logger.getLogger(getClass()).info("APPLYING FEATURE SELECTION");
                trainData = WekaUtils.applyAttributeSelectionFilter(trainData, selectionFilter);
                testData = WekaUtils.applyAttributeSelectionFilter(testData, selectionFilter);
            }
        }
    } else if (featureSearcher != null && attributeEvaluator != null) {
        AttributeSelection selection = WekaUtils.featureSelectionSinglelabel(aContext, trainData, featureSearcher, attributeEvaluator);
        FileUtils.writeStringToFile(WekaUtils.getFile(aContext, "", WekaTestTask.featureSelectionFile, AccessMode.READWRITE), selection.toResultsString(), "utf-8");
        if (applySelection) {
            Logger.getLogger(getClass()).info("APPLYING FEATURE SELECTION");
            trainData = selection.reduceDimensionality(trainData);
            testData = selection.reduceDimensionality(testData);
        }
    }

    // build classifier
    Classifier classifier = WekaUtils.getClassifier(learningMode, classificationArguments);
    // file to hold prediction results
    File evaluationFile = WekaUtils.getFile(aContext, "", evaluationBin, AccessMode.READWRITE);

    // evaluation & prediction generation
    if (!multiLabel) {
        // in the single label setup the classifier has to be trained on the train set first
        classifier.buildClassifier(trainData);
        weka.core.SerializationHelper.write(evaluationFile.getAbsolutePath(), WekaUtils.getEvaluationSinglelabel(classifier, trainData, testData));
        testData = WekaUtils.getPredictionInstancesSingleLabel(testData, classifier);
        testData = WekaUtils.addInstanceId(testData, testDataWithIds, false);
    } else {
        // we don't need to build the classifier - meka does this internally
        Result mekaResult = WekaUtils.getEvaluationMultilabel(classifier, trainData, testData, threshold);
        WekaUtils.writeMlResultToFile(new MultilabelResult(mekaResult.allTrueValues(), mekaResult.allPredictions(), threshold), evaluationFile);
        testData = WekaUtils.getPredictionInstancesMultiLabel(testData, classifier, WekaUtils.getMekaThreshold(threshold, mekaResult, trainData));
        testData = WekaUtils.addInstanceId(testData, testDataWithIds, true);
    }

    // Write out the prediction - the data sink expects an .arff ending file so we game it a bit
    // and rename the file afterwards to the actual prediction file name
    File predictionFile = WekaUtils.getFile(aContext, "", Constants.FILENAME_PREDICTIONS, AccessMode.READWRITE);
    File temporaryArff = new File(predictionFile.getParent(), "prediction.arff");
    DataSink.write(temporaryArff.getAbsolutePath(), testData);
    FileUtils.moveFile(temporaryArff, predictionFile);
}
Also used : Instances(weka.core.Instances) AttributeSelection(weka.attributeSelection.AttributeSelection) MultilabelResult(org.dkpro.tc.ml.weka.util.MultilabelResult) Remove(weka.filters.unsupervised.attribute.Remove) Classifier(weka.classifiers.Classifier) File(java.io.File) MultilabelResult(org.dkpro.tc.ml.weka.util.MultilabelResult) Result(meka.core.Result)

Aggregations

Remove (weka.filters.unsupervised.attribute.Remove)4 MultiLabelInstances (mulan.data.MultiLabelInstances)3 Instances (weka.core.Instances)3 File (java.io.File)2 FileNotFoundException (java.io.FileNotFoundException)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 Enumeration (java.util.Enumeration)1 Result (meka.core.Result)1 InvalidDataFormatException (mulan.data.InvalidDataFormatException)1 BinaryRelevanceAttributeEvaluator (mulan.dimensionalityReduction.BinaryRelevanceAttributeEvaluator)1 LabelPowersetAttributeEvaluator (mulan.dimensionalityReduction.LabelPowersetAttributeEvaluator)1 Ranker (mulan.dimensionalityReduction.Ranker)1 TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException)1 MultilabelResult (org.dkpro.tc.ml.weka.util.MultilabelResult)1 ASEvaluation (weka.attributeSelection.ASEvaluation)1 AttributeEvaluator (weka.attributeSelection.AttributeEvaluator)1 AttributeSelection (weka.attributeSelection.AttributeSelection)1 Classifier (weka.classifiers.Classifier)1 Attribute (weka.core.Attribute)1