use of weka.core.Instances in project dkpro-tc by dkpro.
the class WekaUtils method instanceListToArffFile.
/**
 * Writes a list of instances to an ARFF file using Weka's incremental {@code ArffSaver} with
 * compressed output enabled. Single-label case.
 *
 * @param outputFile
 *            the file the ARFF data is written to; missing parent directories are created
 * @param instanceList
 *            the instances to write; must contain at least one instance
 * @param useDenseInstances
 *            if {@code true} each instance is written as a {@code DenseInstance}, otherwise as a
 *            {@code SparseInstance}
 * @param isRegressionExperiment
 *            if {@code true} every outcome is parsed as a double and stored as a numeric class
 *            value, otherwise the outcome string is stored as the class value
 * @param useWeights
 *            if {@code true} the weight of each instance is copied to the Weka instance;
 *            otherwise the Weka instance keeps the weight 1.0
 * @throws IllegalArgumentException
 *             if no outcomes could be collected, i.e. the instance list is empty
 * @throws Exception
 *             in case of any other error, e.g. I/O problems or an unparsable regression outcome
 */
public static void instanceListToArffFile(File outputFile, List<Instance> instanceList, boolean useDenseInstances, boolean isRegressionExperiment, boolean useWeights) throws Exception {
    List<String> outcomeList = new ArrayList<>();
    for (Instance inst : instanceList) {
        outcomeList.add(inst.getOutcome());
    }
    // check for error conditions
    if (outcomeList.isEmpty()) {
        throw new IllegalArgumentException("List of instance outcomes is empty.");
    }

    AttributeStore attributeStore = WekaFeatureEncoder.getAttributeStore(instanceList);

    // Make sure the outcome attribute does not collide with a feature of the same name
    Attribute outcomeAttribute = createOutcomeAttribute(outcomeList, isRegressionExperiment);
    if (attributeStore.containsAttributeName(CLASS_ATTRIBUTE_NAME)) {
        System.err.println("A feature with name \"outcome\" was found. Renaming outcome attribute");
        outcomeAttribute = outcomeAttribute.copy(CLASS_ATTRIBUTE_PREFIX + CLASS_ATTRIBUTE_NAME);
    }
    attributeStore.addAttribute(outcomeAttribute.name(), outcomeAttribute);

    Instances wekaInstances = new Instances(RELATION_NAME, attributeStore.getAttributes(), instanceList.size());
    wekaInstances.setClass(outcomeAttribute);

    if (!outputFile.exists()) {
        // Create the PARENT directories only. Calling mkdirs() on the output file itself would
        // turn the target path into a directory and make createNewFile() a no-op.
        File parentDir = outputFile.getAbsoluteFile().getParentFile();
        if (parentDir != null) {
            parentDir.mkdirs();
        }
        outputFile.createNewFile();
    }

    ArffSaver saver = new ArffSaver();
    saver.setRetrieval(Saver.INCREMENTAL);
    saver.setFile(outputFile);
    saver.setCompressOutput(true);
    saver.setInstances(wekaInstances);

    for (int idx = 0; idx < instanceList.size(); idx++) {
        Instance instance = instanceList.get(idx);
        double[] featureValues = getFeatureValues(attributeStore, instance);

        weka.core.Instance wekaInstance;
        if (useDenseInstances) {
            wekaInstance = new DenseInstance(1.0, featureValues);
        } else {
            wekaInstance = new SparseInstance(1.0, featureValues);
        }
        // the dataset must be attached before the class value can be set
        wekaInstance.setDataset(wekaInstances);

        String outcome = outcomeList.get(idx);
        if (isRegressionExperiment) {
            wekaInstance.setClassValue(Double.parseDouble(outcome));
        } else {
            wekaInstance.setClassValue(outcome);
        }

        // only consult the instance weight when it is actually requested
        if (useWeights) {
            wekaInstance.setWeight(instance.getWeight());
        }

        saver.writeIncremental(wekaInstance);
    }

    // finishes the incremental saving process
    saver.writeIncremental(null);
}
use of weka.core.Instances in project dkpro-tc by dkpro.
the class WekaUtils method applyAttributeSelectionFilter.
/**
 * Runs the given {@code Remove} filter over the training data and afterwards moves the class
 * attributes into the position MEKA expects. The "-C &lt;n&gt;" token of the relation name is
 * rewritten to the class index of the resulting data set.
 *
 * @param trainData
 *            the data to be reduced
 * @param removeFilter
 *            the configured remove filter; when {@code null} the data is returned untouched
 * @return the filtered and reordered instances, or {@code trainData} itself if no filter was
 *         given
 * @throws Exception
 *             if one of the filters fails
 */
public static Instances applyAttributeSelectionFilter(Instances trainData, Remove removeFilter) throws Exception {
    // no filter configured => nothing to do
    if (removeFilter == null) {
        return trainData;
    }

    Instances reduced = Filter.useFilter(trainData, removeFilter);
    int originalClassIndex = trainData.classIndex();
    reduced.setClassIndex(originalClassIndex);

    // move the class attributes so that the layout fits MEKA
    int firstClassPosition = reduced.numAttributes() - originalClassIndex + 1;
    MekaClassAttributes reorderFilter = new MekaClassAttributes();
    reorderFilter.setAttributeIndices(firstClassPosition + "-last");
    reorderFilter.setInputFormat(reduced);
    Instances reordered = Filter.useFilter(reduced, reorderFilter);

    // keep the "-C" token of the relation name in sync with the new class index
    String updatedRelation = reordered.relationName().replaceAll("\\-C\\s[\\d]+", "-C " + reordered.classIndex());
    reordered.setRelationName(updatedRelation);
    return reordered;
}
use of weka.core.Instances in project dkpro-tc by dkpro.
the class WekaUtils method getInstances.
/**
 * Read instances from uncompressed or compressed arff files. Compression is determined by
 * filename suffix (".gz" or ".bz2"). For bz2 files, it is expected that the first two bytes
 * mark the compression type (BZ) - thus, the first two bytes of the stream are skipped. <br>
 * For arff files with single-label outcome, the class attribute is expected at the end of the
 * attribute set. For arff files with multi-label outcome, the class attribute is expected at
 * the beginning of the attribute set; additionally the number of class labels must be specified
 * in the relation tag behind a "-C" argument, e.g. "-C 3".
 * <p>
 * All streams opened by this method are closed before it returns, also when reading fails.
 *
 * @param instancesFile
 *            arff File
 * @param multiLabel
 *            whether this arff file contains single- or multi-label outcome
 * @return instances with class attribute set
 * @throws FileNotFoundException
 *             if file is not found
 * @throws IOException
 *             if an exception occurs while reading
 * @throws IllegalStateException
 *             if {@code multiLabel} is set but the relation name carries no "-C &lt;n&gt;" argument
 */
public static Instances getInstances(File instancesFile, boolean multiLabel) throws FileNotFoundException, IOException {
    Instances trainData;
    // The outer resource guarantees that the file handle is released even if one of the
    // decompressing wrappers fails in its constructor.
    try (BufferedInputStream bufStr = new BufferedInputStream(new FileInputStream(instancesFile))) {
        String fileName = instancesFile.getName();
        InputStream underlyingStream;
        if (fileName.endsWith(".gz")) {
            underlyingStream = new GZIPInputStream(bufStr);
        } else if (fileName.endsWith(".bz2")) {
            // skip bzip2 prefix that we added manually
            bufStr.read();
            bufStr.read();
            underlyingStream = new CBZip2InputStream(bufStr);
        } else {
            underlyingStream = bufStr;
        }
        try (Reader reader = new InputStreamReader(underlyingStream, "UTF-8")) {
            trainData = new Instances(reader);
        }
    }

    if (multiLabel) {
        // for multi-label classification, class labels are expected at beginning of attribute
        // set and their number must be specified with the -C parameter in the relation tag
        String relationTag = trainData.relationName();
        Matcher m = Pattern.compile("-C\\s(\\d+)").matcher(relationTag);
        if (!m.find()) {
            throw new IllegalStateException("Relation name [" + relationTag + "] of file [" + instancesFile + "] does not specify the number of class labels via \"-C <n>\"");
        }
        trainData.setClassIndex(Integer.parseInt(m.group(1)));
    } else {
        // for single-label classification, class label expected as last attribute
        trainData.setClassIndex(trainData.numAttributes() - 1);
    }
    return trainData;
}
use of weka.core.Instances in project dkpro-tc by dkpro.
the class WekaUtils method makeOutcomeClassesCompatible.
/**
* Builds a copy of the test data whose outcome (class) attribute(s) use the label set of the
* training data. Class labels from the test data unseen in the training data are not carried
* over. Class labels from the training data unseen in the test data are added to the test data.
* <p>
* Single-label: a new nominal class attribute holding the training labels is added, the old
* class attribute is removed, and the new attribute becomes the class attribute.<br>
* Multi-label: one new "0,1" attribute per training label is added, the old label attributes
* are removed, and the relation name and class index are set to the number of training labels.
* <p>
* NOTE(review): earlier documentation stated that nothing is done when training and test labels
* are equal; no such shortcut is visible in this method - the data is always rebuilt.
*
* @param trainData
* train data; only its class attribute / label attributes are read here
* @param testData
* test data to adapt; its instances are modified in place while the result is assembled
* @param multilabel
* whether the data is multi-label ({@code true}) or single-label ({@code false})
* @return the adapted test instances
* @throws Exception
* in case of error, e.g. when one of the Weka filters fails
*/
@SuppressWarnings({ "rawtypes", "unchecked" })
public static Instances makeOutcomeClassesCompatible(Instances trainData, Instances testData, boolean multilabel) throws Exception {
// new (compatible) test data
Instances compTestData = null;
// ================ SINGLE LABEL BRANCH ======================
if (!multilabel) {
// retrieve class labels
Enumeration trainOutcomeValues = trainData.classAttribute().enumerateValues();
Enumeration testOutcomeValues = testData.classAttribute().enumerateValues();
ArrayList trainLabels = Collections.list(trainOutcomeValues);
ArrayList testLabels = Collections.list(testOutcomeValues);
// add new outcome class attribute to test data; its nominal values are the training labels
// joined by ',' (NOTE(review): a label that itself contains ',' would presumably be split -
// confirm against the Add filter)
Add addFilter = new Add();
addFilter.setNominalLabels(StringUtils.join(trainLabels, ','));
addFilter.setAttributeName(Constants.CLASS_ATTRIBUTE_NAME + COMPATIBLE_OUTCOME_CLASS);
addFilter.setInputFormat(testData);
// from here on testData refers to the filtered data set, not to the caller's argument
testData = Filter.useFilter(testData, addFilter);
// fill NEW test data with values from old test data plus the new class attribute
compTestData = new Instances(testData, testData.numInstances());
for (int i = 0; i < testData.numInstances(); i++) {
weka.core.Instance instance = testData.instance(i);
// the stored class value is used as an index into the list of test labels
String label = (String) testLabels.get((int) instance.value(testData.classAttribute()));
if (trainLabels.indexOf(label) != -1) {
// label is known to the training data: copy it into the new class attribute
instance.setValue(testData.attribute(Constants.CLASS_ATTRIBUTE_NAME + COMPATIBLE_OUTCOME_CLASS), label);
} else {
// label unknown to the training data: the OLD class attribute is set to missing; the new
// attribute is not written here (presumably it stays missing as created by the Add filter -
// TODO confirm)
instance.setMissing(testData.classIndex());
}
compTestData.add(instance);
}
// remove old class attribute (index() is 0-based, the Remove filter gets index + 1)
Remove remove = new Remove();
remove.setAttributeIndices(Integer.toString(compTestData.attribute(Constants.CLASS_ATTRIBUTE_NAME).index() + 1));
remove.setInvertSelection(false);
remove.setInputFormat(compTestData);
compTestData = Filter.useFilter(compTestData, remove);
// set new class attribute
compTestData.setClass(compTestData.attribute(Constants.CLASS_ATTRIBUTE_NAME + COMPATIBLE_OUTCOME_CLASS));
} else // ================ MULTI LABEL BRANCH ======================
{
// in this branch the class index is used as the number of label attributes; the label
// attributes are addressed as positions 0 .. classIndex - 1 below
int numTrainLabels = trainData.classIndex();
int numTestLabels = testData.classIndex();
ArrayList<String> trainLabels = getLabels(trainData);
// ArrayList<String> testLabels = getLabels(testData);
// add new outcome class attributes to test data, one per training label, placed directly
// behind the existing test label attributes; the same Add filter object is reconfigured and
// re-applied on every iteration
Add filter = new Add();
for (int i = 0; i < numTrainLabels; i++) {
// numTestLabels +i (because index starts from 0)
filter.setAttributeIndex(Integer.toString(numTestLabels + i + 1));
filter.setNominalLabels("0,1");
filter.setAttributeName(trainData.attribute(i).name() + COMPATIBLE_OUTCOME_CLASS);
filter.setInputFormat(testData);
testData = Filter.useFilter(testData, filter);
}
// fill NEW test data with values from old test data plus the new class attributes
compTestData = new Instances(testData, testData.numInstances());
for (int i = 0; i < testData.numInstances(); i++) {
weka.core.Instance instance = testData.instance(i);
// initialise every new label attribute with 0.
for (int j = 0; j < numTrainLabels; j++) {
instance.setValue(j + numTestLabels, 0.);
}
// fill the real values:
for (int j = 0; j < numTestLabels; j++) {
// part of train data: forget labels which are not part of the train data
if (trainLabels.indexOf(instance.attribute(j).name()) != -1) {
// class label found in test data; its position in trainLabels selects the new attribute
// (assumes getLabels returns the names in training attribute order - TODO confirm)
int index = trainLabels.indexOf(instance.attribute(j).name());
instance.setValue(index + numTestLabels, instance.value(j));
}
}
compTestData.add(instance);
}
// remove old class attributes: the first attribute is dropped numTestLabels times
for (int i = 0; i < numTestLabels; i++) {
Remove remove = new Remove();
remove.setAttributeIndices("1");
remove.setInvertSelection(false);
remove.setInputFormat(compTestData);
compTestData = Filter.useFilter(compTestData, remove);
}
// adapt header and set new class label: the relation name is cut right after "-C" and the
// training label count is appended; whatever followed "-C" in the old name is dropped
// NOTE(review): if the name contains no "-C", indexOf returns -1 and only its first character
// is kept - confirm that callers always provide a "-C" token
String relationTag = compTestData.relationName();
compTestData.setRelationName(relationTag.substring(0, relationTag.indexOf("-C") + 2) + " " + numTrainLabels + " ");
compTestData.setClassIndex(numTrainLabels);
}
return compTestData;
}
use of weka.core.Instances in project dkpro-tc by dkpro.
the class WekaResultsTest method testWekaResultsSingleLabel.
/**
 * Aligns the single-label test data with the training labels, strips the instance ids from
 * both data sets, trains an SMO classifier and expects seven correctly classified test
 * instances.
 */
@Test
public void testWekaResultsSingleLabel() throws Exception {
    Instances compatibleTestData = WekaUtils.makeOutcomeClassesCompatible(singleLabelTrainData, singleLabelTestData, false);
    Instances cleanedTrainData = WekaUtils.removeInstanceId(singleLabelTrainData, false);
    Instances cleanedTestData = WekaUtils.removeInstanceId(compatibleTestData, false);

    SMO classifier = new SMO();
    classifier.buildClassifier(cleanedTrainData);

    Evaluation evaluation = WekaUtils.getEvaluationSinglelabel(classifier, cleanedTrainData, cleanedTestData);
    assertEquals(7.0, evaluation.correct(), 0.01);
}
Aggregations