Use of weka.core.Instances in project dkpro-tc by dkpro.
Class WekaUtils, method removeInstanceId.
/**
 * Produces a copy of the given data set that no longer contains the instance-id attribute
 * (the attribute named {@code Constants.ID_FEATURE_NAME}). When the data set has no such
 * attribute, an unmodified copy is returned instead.
 *
 * @param data
 *            data set with or without instanceId attribute
 * @param multilabel
 *            whether multi-label processing is active; if {@code true}, the class index of
 *            the input is explicitly re-applied to the returned data set
 * @return the data set without instanceId attribute
 * @throws Exception
 *             if configuring or applying the filter fails
 */
public static Instances removeInstanceId(Instances data, boolean multilabel) throws Exception {
    // remember the class index up front so it can be restored after filtering
    final int originalClassIndex = data.classIndex();
    final boolean hasInstanceId = data.attribute(Constants.ID_FEATURE_NAME) != null;

    Instances result;
    if (!hasInstanceId) {
        // nothing to strip, hand back a copy
        result = new Instances(data);
    } else {
        // the filter is given the attribute position shifted by one relative to index()
        final int zeroBasedIndex = data.attribute(Constants.ID_FEATURE_NAME).index();
        final String oneBasedIndex = Integer.toString(zeroBasedIndex + 1);

        Remove idRemover = new Remove();
        idRemover.setAttributeIndices(oneBasedIndex);
        idRemover.setInvertSelection(false);
        idRemover.setInputFormat(data);
        result = Filter.useFilter(data, idRemover);
    }

    // make sure the class index gets retained in multi-label
    if (multilabel) {
        result.setClassIndex(originalClassIndex);
    }
    return result;
}
Use of weka.core.Instances in project dkpro-tc by dkpro.
Class WekaUtils, method addInstanceId.
/**
 * Copies the instanceId attribute and its values from an existing data set into a copy of
 * another data set, iff the existing data set has such an attribute. The new string attribute
 * is inserted as the last attribute in multi-label mode and as the first attribute otherwise.
 * Values are transferred row by row, i.e. instance {@code i} of the result receives the id of
 * instance {@code i} of {@code oldData}.
 *
 * @param newData
 *            data set without instanceId attribute
 * @param oldData
 *            data set with or without instanceId attribute
 * @param isMultilabel
 *            is multi label processing
 * @return a data set with or without instanceId attribute
 * @throws Exception
 *             if configuring or applying the filter fails
 */
public static Instances addInstanceId(Instances newData, Instances oldData, boolean isMultilabel) throws Exception {
    // no id available in the source data set: return a plain copy
    if (oldData.attribute(Constants.ID_FEATURE_NAME) == null) {
        return new Instances(newData);
    }
    final int sourceIdIndex = oldData.attribute(Constants.ID_FEATURE_NAME).index();

    Add idAdder = new Add();
    idAdder.setAttributeName(Constants.ID_FEATURE_NAME);
    // multi-label: append at the end; single-label: insert at the front
    idAdder.setAttributeIndex(isMultilabel ? "last" : "first");
    idAdder.setAttributeType(new SelectedTag(Attribute.STRING, Add.TAGS_TYPE));
    idAdder.setInputFormat(newData);
    Instances withId = Filter.useFilter(newData, idAdder);

    // position of the freshly added attribute in the filtered data set
    final int targetIdIndex = isMultilabel ? withId.numAttributes() - 1 : 0;
    final int rowCount = withId.numInstances();
    for (int row = 0; row < rowCount; row++) {
        String idValue = oldData.instance(row).stringValue(sourceIdIndex);
        withId.instance(row).setValue(targetIdIndex, idValue);
    }
    return withId;
}
Use of weka.core.Instances in project dkpro-tc by dkpro.
Class MekaDataWriter, method initalConfiguration.
/**
 * Sets up the incremental ARFF saver, the attribute store and the master data set header on
 * the first call; on every later call (saver already present) the previously built master
 * data set is returned unchanged.
 *
 * The feature attributes are read from the feature description file in the output folder
 * (tab-separated: name, type and, for nominal features, a third column). Outcome attributes
 * are placed in front of all feature attributes.
 *
 * @param instances
 *            the instances about to be written; only their count is used here, as the third
 *            argument of the {@code Instances} constructor
 * @return the master data set that written instances are attached to
 * @throws TextClassificationException
 *             declared for failures during attribute creation
 * @throws IOException
 *             if the feature description file cannot be read or the saver target cannot be set
 */
private Instances initalConfiguration(Collection<Instance> instances) throws TextClassificationException, IOException {
    if (saver == null) {
        saver = new ArffSaver();
        saver.setRetrieval(Saver.INCREMENTAL);
        saver.setFile(arffTarget);
        saver.setCompressOutput(false);

        attributeStore = new AttributeStore();

        File descriptionFile = new File(outputFolder, Constants.FILENAME_FEATURES_DESCRIPTION);
        List<String> descriptionLines = FileUtils.readLines(descriptionFile, "utf-8");
        for (String descriptionLine : descriptionLines) {
            String[] columns = descriptionLine.split("\t");
            String name = columns[0];
            if (attributeStore.containsAttributeName(name)) {
                // feature already registered, keep the first description
                continue;
            }
            FeatureType featureType = FeatureType.valueOf(columns[1]);
            // only nominal features carry a third column
            String nominalValues = featureType == FeatureType.NOMINAL ? columns[2] : null;
            Attribute featureAttribute = WekaFeatureEncoder
                    .featureToAttributeUsingFeatureDescription(name, featureType, nominalValues);
            attributeStore.addAttribute(name, featureAttribute);
        }

        outcomeAttributes = createOutcomeAttributes(Arrays.asList(outcomes));

        // in Meka, class label attributes have to go on top
        for (Attribute labelAttribute : outcomeAttributes) {
            attributeStore.addAttributeAtBegin(labelAttribute.name(), labelAttribute);
        }

        // for Meka-internal use: the number of label attributes is encoded in the relation name
        String relationName = WekaUtils.RELATION_NAME + ": -C " + outcomeAttributes.size() + " ";
        masterInstance = new Instances(relationName, attributeStore.getAttributes(), instances.size());
        masterInstance.setClassIndex(outcomeAttributes.size());
        saver.setInstances(masterInstance);
    }
    return masterInstance;
}
Use of weka.core.Instances in project dkpro-tc by dkpro.
Class MekaDataWriter, method writeClassifierFormat.
/**
 * Converts each given instance into a Weka instance (sparse or dense, depending on
 * {@code useSparse}) and writes it incrementally through the ARFF saver. For every outcome
 * attribute the corresponding value is set to 1.0 if the instance carries that label and to
 * 0.0 otherwise.
 *
 * @param instances
 *            the instances to write
 * @throws AnalysisEngineProcessException
 *             wrapping any exception raised during configuration, conversion or writing
 */
@Override
public void writeClassifierFormat(Collection<Instance> instances) throws AnalysisEngineProcessException {
    try {
        Instances header = initalConfiguration(instances);
        for (Instance tcInstance : instances) {
            double[] values = getFeatureValues(attributeStore, tcInstance);

            // set class label values
            List<String> assignedLabels = tcInstance.getOutcomes();
            for (Attribute labelAttribute : outcomeAttributes) {
                String attributeName = labelAttribute.name();
                int offset = attributeStore.getAttributeOffset(attributeName);
                // the plain label is the part of the attribute name after the class prefix
                String plainLabel = attributeName.split(CLASS_ATTRIBUTE_PREFIX)[1];
                if (assignedLabels.contains(plainLabel)) {
                    values[offset] = 1.0d;
                } else {
                    values[offset] = 0.0d;
                }
            }

            weka.core.Instance converted = useSparse
                    ? new SparseInstance(1.0, values)
                    : new DenseInstance(1.0, values);
            converted.setDataset(header);

            Double weight = tcInstance.getWeight();
            if (applyWeighting) {
                converted.setWeight(weight);
            }
            saver.writeIncremental(converted);
        }
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
}
Use of weka.core.Instances in project dkpro-tc by dkpro.
Class WekaDataWriter, method initalConfiguration.
/**
 * Sets up the incremental ARFF saver, the attribute store and the master data set header on
 * the first call; on every later call (saver already present) the previously built master
 * data set is returned unchanged.
 *
 * The feature attributes are read from the feature description file in the output folder
 * (tab-separated: name, type and, for nominal features, a third column). The outcome
 * attribute is added after the feature attributes and registered as the class attribute; it
 * is renamed with the class prefix if a feature already uses the class attribute name.
 *
 * @param instances
 *            the instances about to be written; only their count is used here, as the third
 *            argument of the {@code Instances} constructor
 * @return the master data set that written instances are attached to
 * @throws TextClassificationException
 *             declared for failures during attribute creation
 * @throws IOException
 *             if the feature description file cannot be read or the saver target cannot be set
 */
private Instances initalConfiguration(Collection<Instance> instances) throws TextClassificationException, IOException {
    if (saver == null) {
        saver = new ArffSaver();
        saver.setRetrieval(Saver.INCREMENTAL);
        saver.setFile(arffTarget);
        saver.setCompressOutput(false);

        attributeStore = new AttributeStore();

        File descriptionFile = new File(outputFolder, Constants.FILENAME_FEATURES_DESCRIPTION);
        List<String> descriptionLines = FileUtils.readLines(descriptionFile, "utf-8");
        for (String descriptionLine : descriptionLines) {
            String[] columns = descriptionLine.split("\t");
            String name = columns[0];
            if (attributeStore.containsAttributeName(name)) {
                // feature already registered, keep the first description
                continue;
            }
            FeatureType featureType = FeatureType.valueOf(columns[1]);
            // only nominal features carry a third column
            String nominalValues = featureType == FeatureType.NOMINAL ? columns[2] : null;
            Attribute featureAttribute = WekaFeatureEncoder
                    .featureToAttributeUsingFeatureDescription(name, featureType, nominalValues);
            attributeStore.addAttribute(name, featureAttribute);
        }

        outcomeAttribute = createOutcomeAttribute(Arrays.asList(outcomes), isRegression);

        // Make sure "outcome" is not the name of an attribute
        boolean nameClash = attributeStore.containsAttributeName(CLASS_ATTRIBUTE_NAME);
        if (nameClash) {
            System.err.println("A feature with name \"outcome\" was found. Renaming outcome attribute");
            outcomeAttribute = outcomeAttribute.copy(CLASS_ATTRIBUTE_PREFIX + CLASS_ATTRIBUTE_NAME);
        }
        attributeStore.addAttribute(outcomeAttribute.name(), outcomeAttribute);

        masterInstance = new Instances(WekaUtils.RELATION_NAME, attributeStore.getAttributes(), instances.size());
        masterInstance.setClass(outcomeAttribute);
        saver.setInstances(masterInstance);
    }
    return masterInstance;
}
Aggregations