Use of weka.filters.unsupervised.attribute.Remove in the project dkpro-tc by dkpro:
the class WekaUtils, method makeOutcomeClassesCompatible.
/**
 * Adapts the test data class labels to the training data. Class labels from the test data
 * unseen in the training data will be deleted from the test data. Class labels from the
 * training data unseen in the test data will be added to the test data. If training and test
 * class labels are equal, nothing will be done.
 *
 * @param trainData
 * train data
 * @param testData
 * test data
 * @param multilabel
 * whether this is a multi-label setup (one binary class attribute per label)
 * @return a new test data set whose class attribute(s) are compatible with the training data
 * @throws Exception
 * in case of error
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public static Instances makeOutcomeClassesCompatible(Instances trainData, Instances testData, boolean multilabel) throws Exception {
// new (compatible) test data
Instances compTestData = null;
// ================ SINGLE LABEL BRANCH ======================
if (!multilabel) {
// retrieve the nominal class labels of both data sets
Enumeration trainOutcomeValues = trainData.classAttribute().enumerateValues();
Enumeration testOutcomeValues = testData.classAttribute().enumerateValues();
ArrayList trainLabels = Collections.list(trainOutcomeValues);
ArrayList testLabels = Collections.list(testOutcomeValues);
// add a new class attribute to the test data that carries exactly the train label set
Add addFilter = new Add();
addFilter.setNominalLabels(StringUtils.join(trainLabels, ','));
addFilter.setAttributeName(Constants.CLASS_ATTRIBUTE_NAME + COMPATIBLE_OUTCOME_CLASS);
addFilter.setInputFormat(testData);
testData = Filter.useFilter(testData, addFilter);
// fill NEW test data with values from old test data plus the new class attribute
compTestData = new Instances(testData, testData.numInstances());
for (int i = 0; i < testData.numInstances(); i++) {
weka.core.Instance instance = testData.instance(i);
// a nominal value is stored as an index into the attribute's label list
String label = (String) testLabels.get((int) instance.value(testData.classAttribute()));
if (trainLabels.indexOf(label) != -1) {
// label known to the training data: copy it into the new class attribute
instance.setValue(testData.attribute(Constants.CLASS_ATTRIBUTE_NAME + COMPATIBLE_OUTCOME_CLASS), label);
} else {
// label unseen during training: mark the outcome as missing
instance.setMissing(testData.classIndex());
}
compTestData.add(instance);
}
// remove the old class attribute; note that Remove expects 1-based attribute indices
Remove remove = new Remove();
remove.setAttributeIndices(Integer.toString(compTestData.attribute(Constants.CLASS_ATTRIBUTE_NAME).index() + 1));
remove.setInvertSelection(false);
remove.setInputFormat(compTestData);
compTestData = Filter.useFilter(compTestData, remove);
// set the newly added attribute as the class attribute
compTestData.setClass(compTestData.attribute(Constants.CLASS_ATTRIBUTE_NAME + COMPATIBLE_OUTCOME_CLASS));
} else // ================ MULTI LABEL BRANCH ======================
{
// the first classIndex attributes are the label attributes (Meka-style layout,
// see the "-C" relation-name tag handled below)
int numTrainLabels = trainData.classIndex();
int numTestLabels = testData.classIndex();
ArrayList<String> trainLabels = getLabels(trainData);
// ArrayList<String> testLabels = getLabels(testData);
// append one binary (0/1) attribute per training label after the existing test labels
Add filter = new Add();
for (int i = 0; i < numTrainLabels; i++) {
// insertion position is 1-based, hence the +1
filter.setAttributeIndex(Integer.toString(numTestLabels + i + 1));
filter.setNominalLabels("0,1");
filter.setAttributeName(trainData.attribute(i).name() + COMPATIBLE_OUTCOME_CLASS);
filter.setInputFormat(testData);
testData = Filter.useFilter(testData, filter);
}
// fill NEW test data with values from old test data plus the new class attributes
compTestData = new Instances(testData, testData.numInstances());
for (int i = 0; i < testData.numInstances(); i++) {
weka.core.Instance instance = testData.instance(i);
// initialize all newly added label attributes with 0
for (int j = 0; j < numTrainLabels; j++) {
instance.setValue(j + numTestLabels, 0.);
}
// copy over the real values for labels the training data also knows
for (int j = 0; j < numTestLabels; j++) {
// labels which are not part of the train data are silently dropped
if (trainLabels.indexOf(instance.attribute(j).name()) != -1) {
// label also present in the training data: keep its value
int index = trainLabels.indexOf(instance.attribute(j).name());
instance.setValue(index + numTestLabels, instance.value(j));
}
}
compTestData.add(instance);
}
// remove the old label attributes; after each removal the next old label is
// again at (1-based) position 1, so the same index is removed numTestLabels times
for (int i = 0; i < numTestLabels; i++) {
Remove remove = new Remove();
remove.setAttributeIndices("1");
remove.setInvertSelection(false);
remove.setInputFormat(compTestData);
compTestData = Filter.useFilter(compTestData, remove);
}
// rewrite the label count in the "-C <n>" relation-name tag and set the new class index
String relationTag = compTestData.relationName();
compTestData.setRelationName(relationTag.substring(0, relationTag.indexOf("-C") + 2) + " " + numTrainLabels + " ");
compTestData.setClassIndex(numTrainLabels);
}
return compTestData;
}
Use of weka.filters.unsupervised.attribute.Remove in the project dkpro-tc by dkpro:
the class WekaUtils, method removeInstanceId.
/**
 * Removes the instanceId attribute from the given data set, iff it is present; otherwise the
 * data is returned as an unmodified copy.
 *
 * @param data
 *            data set with or without instanceId attribute
 * @param multilabel
 *            is multi label processing
 * @return the data set without instanceId attribute
 * @throws Exception
 *             an exception
 */
public static Instances removeInstanceId(Instances data, boolean multilabel) throws Exception {
    // remember the class index; filtering may reset it in the multi-label case
    final int originalClassIndex = data.classIndex();
    Instances result;
    if (data.attribute(Constants.ID_FEATURE_NAME) == null) {
        // no instanceId attribute present — hand back a plain copy
        result = new Instances(data);
    } else {
        final int idPosition = data.attribute(Constants.ID_FEATURE_NAME).index();
        Remove removeFilter = new Remove();
        // Remove expects 1-based attribute indices
        removeFilter.setAttributeIndices(String.valueOf(idPosition + 1));
        removeFilter.setInvertSelection(false);
        removeFilter.setInputFormat(data);
        result = Filter.useFilter(data, removeFilter);
    }
    // make sure the class index gets retained in multi-label
    if (multilabel) {
        result.setClassIndex(originalClassIndex);
    }
    return result;
}
Use of weka.filters.unsupervised.attribute.Remove in the project dkpro-tc by dkpro:
the class WekaUtils, method featureSelectionMultilabel.
/**
 * Feature selection using Mulan. Ranks all feature attributes with the given evaluator,
 * writes the per-attribute scores to the feature-selection results file, and builds a
 * {@link Remove} filter that keeps only the top-ranked features plus all label attributes.
 *
 * @param aContext
 *            Lab context (used to locate the results file)
 * @param trainData
 *            training data
 * @param attributeEvaluator
 *            evaluator class name (first element) followed by its options
 * @param labelTransformationMethod
 *            transformation method; either "LabelPowersetAttributeEvaluator" or
 *            "BinaryRelevanceAttributeEvaluator"
 * @param numLabelsToKeep
 *            number of top-ranked feature attributes to keep
 * @return a configured Remove filter, or null if the data has fewer attributes than
 *         requested (in which case no filtering should be applied)
 * @throws TextClassificationException
 *             in case of errors
 */
public static Remove featureSelectionMultilabel(TaskContext aContext, Instances trainData, List<String> attributeEvaluator, String labelTransformationMethod, int numLabelsToKeep) throws TextClassificationException {
    // file to hold the results of attribute selection
    File fsResultsFile = getFile(aContext, TEST_TASK_OUTPUT_KEY, WekaTestTask.featureSelectionFile, AccessMode.READWRITE);
    // filter for reducing dimension of attributes
    Remove filterRemove = new Remove();
    try {
        MultiLabelInstances mulanInstances = convertMekaInstancesToMulanInstances(trainData);
        ASEvaluation eval = ASEvaluation.forName(attributeEvaluator.get(0), attributeEvaluator.subList(1, attributeEvaluator.size()).toArray(new String[0]));
        AttributeEvaluator attributeSelectionFilter;
        // Mulan offers no command-line support for its label transformation wrappers,
        // so the method has to be dispatched by name here:
        if (labelTransformationMethod.equals("LabelPowersetAttributeEvaluator")) {
            attributeSelectionFilter = new LabelPowersetAttributeEvaluator(eval, mulanInstances);
        } else if (labelTransformationMethod.equals("BinaryRelevanceAttributeEvaluator")) {
            attributeSelectionFilter = new BinaryRelevanceAttributeEvaluator(eval, mulanInstances, "max", "none", "rank");
        } else {
            throw new TextClassificationException("This Label Transformation Method is not supported.");
        }
        Ranker r = new Ranker();
        int[] result = r.search(attributeSelectionFilter, mulanInstances);
        int numLabels = mulanInstances.getNumLabels();
        // collect evaluation for *all* attributes and write to file;
        // StringBuilder suffices for this purely local buffer
        StringBuilder evalFile = new StringBuilder();
        for (Attribute att : mulanInstances.getFeatureAttributes()) {
            // feature attribute indices are offset by the number of label attributes
            evalFile.append(att.name()).append(": ").append(attributeSelectionFilter.evaluateAttribute(att.index() - numLabels)).append('\n');
        }
        FileUtils.writeStringToFile(fsResultsFile, evalFile.toString(), "utf-8");
        // create a filter to reduce the dimension of the attributes:
        // keep the top-ranked features followed by all label attributes
        int[] toKeep = new int[numLabelsToKeep + numLabels];
        System.arraycopy(result, 0, toKeep, 0, numLabelsToKeep);
        int[] labelIndices = mulanInstances.getLabelIndices();
        System.arraycopy(labelIndices, 0, toKeep, numLabelsToKeep, numLabels);
        filterRemove.setAttributeIndicesArray(toKeep);
        // inverted selection: remove everything that is NOT listed in toKeep
        filterRemove.setInvertSelection(true);
        filterRemove.setInputFormat(mulanInstances.getDataSet());
    } catch (ArrayIndexOutOfBoundsException e) {
        // fewer attributes available than we want to keep => no filtering
        return null;
    } catch (Exception e) {
        throw new TextClassificationException(e);
    }
    return filterRemove;
}
Use of weka.filters.unsupervised.attribute.Remove in the project dkpro-tc by dkpro:
the class WekaTestTask, method execute.
/**
 * Runs a train/test experiment: loads both data sets, harmonizes class labels, optionally
 * applies feature selection, trains/evaluates the classifier, and writes the evaluation
 * results and the prediction arff to the task output.
 */
@Override
public void execute(TaskContext aContext) throws Exception {
boolean multiLabel = learningMode.equals(Constants.LM_MULTI_LABEL);
File arffFileTrain = WekaUtils.getFile(aContext, TEST_TASK_INPUT_KEY_TRAINING_DATA, Constants.FILENAME_DATA_IN_CLASSIFIER_FORMAT, AccessMode.READONLY);
File arffFileTest = WekaUtils.getFile(aContext, TEST_TASK_INPUT_KEY_TEST_DATA, Constants.FILENAME_DATA_IN_CLASSIFIER_FORMAT, AccessMode.READONLY);
Instances trainData = WekaUtils.getInstances(arffFileTrain, multiLabel);
Instances testData = WekaUtils.getInstances(arffFileTest, multiLabel);
// harmonize class labels only for classification; regression has no nominal class labels
if (!learningMode.equals(Constants.LM_REGRESSION)) {
testData = WekaUtils.makeOutcomeClassesCompatible(trainData, testData, multiLabel);
}
// keep a copy that still contains the instanceId attribute; needed to re-attach ids below
Instances copyTestData = new Instances(testData);
trainData = WekaUtils.removeInstanceId(trainData, multiLabel);
testData = WekaUtils.removeInstanceId(testData, multiLabel);
// FEATURE SELECTION
if (!learningMode.equals(Constants.LM_MULTI_LABEL)) {
if (featureSearcher != null && attributeEvaluator != null) {
AttributeSelection attSel = WekaUtils.featureSelectionSinglelabel(aContext, trainData, featureSearcher, attributeEvaluator);
File file = WekaUtils.getFile(aContext, "", WekaTestTask.featureSelectionFile, AccessMode.READWRITE);
FileUtils.writeStringToFile(file, attSel.toResultsString(), "utf-8");
if (applySelection) {
Logger.getLogger(getClass()).info("APPLYING FEATURE SELECTION");
trainData = attSel.reduceDimensionality(trainData);
// the same reduction is applied to the test data for a consistent attribute space
testData = attSel.reduceDimensionality(testData);
}
}
} else {
if (attributeEvaluator != null && labelTransformationMethod != null && numLabelsToKeep > 0) {
// may be null when there are fewer attributes than requested; then nothing is applied
Remove attSel = WekaUtils.featureSelectionMultilabel(aContext, trainData, attributeEvaluator, labelTransformationMethod, numLabelsToKeep);
if (applySelection) {
Logger.getLogger(getClass()).info("APPLYING FEATURE SELECTION");
trainData = WekaUtils.applyAttributeSelectionFilter(trainData, attSel);
testData = WekaUtils.applyAttributeSelectionFilter(testData, attSel);
}
}
}
// build classifier
Classifier cl = WekaUtils.getClassifier(learningMode, classificationArguments);
// file to hold prediction results
File evalOutput = WekaUtils.getFile(aContext, "", evaluationBin, AccessMode.READWRITE);
// evaluation & prediction generation
if (multiLabel) {
// we don't need to build the classifier - meka does this
// internally
Result r = WekaUtils.getEvaluationMultilabel(cl, trainData, testData, threshold);
WekaUtils.writeMlResultToFile(new MultilabelResult(r.allTrueValues(), r.allPredictions(), threshold), evalOutput);
testData = WekaUtils.getPredictionInstancesMultiLabel(testData, cl, WekaUtils.getMekaThreshold(threshold, r, trainData));
testData = WekaUtils.addInstanceId(testData, copyTestData, true);
} else {
// train the classifier on the train set split - not necessary in multilabel setup, but
// in single label setup
cl.buildClassifier(trainData);
weka.core.SerializationHelper.write(evalOutput.getAbsolutePath(), WekaUtils.getEvaluationSinglelabel(cl, trainData, testData));
testData = WekaUtils.getPredictionInstancesSingleLabel(testData, cl);
testData = WekaUtils.addInstanceId(testData, copyTestData, false);
}
// Write out the predictions. The data sink expects a file with an .arff extension, so we
// first write to a temporary .arff file and then rename it to the final prediction file.
File predictionFile = WekaUtils.getFile(aContext, "", Constants.FILENAME_PREDICTIONS, AccessMode.READWRITE);
File arffDummy = new File(predictionFile.getParent(), "prediction.arff");
DataSink.write(arffDummy.getAbsolutePath(), testData);
FileUtils.moveFile(arffDummy, predictionFile);
}
Aggregations