Usage of weka.core.DenseInstance in project dkpro-tc (by dkpro):
class WekaUtils, method instanceListToArffFile.
/**
 * Writes a list of TC instances to an ARFF file. Single-label case.
 *
 * @param outputFile
 *            the ARFF output file; missing parent directories are created
 * @param instanceList
 *            the instances to write
 * @param useDenseInstances
 *            if {@code true}, emit dense Weka instances, otherwise sparse ones
 * @param isRegressionExperiment
 *            if {@code true}, the outcome is parsed as a numeric (regression) value
 * @param useWeights
 *            if {@code true}, each instance's weight is transferred to the Weka instance
 * @throws Exception
 *             in case of error
 */
public static void instanceListToArffFile(File outputFile, List<Instance> instanceList, boolean useDenseInstances, boolean isRegressionExperiment, boolean useWeights) throws Exception {
    List<String> outcomeList = new ArrayList<>();
    for (Instance i : instanceList) {
        outcomeList.add(i.getOutcome());
    }
    // check for error conditions
    if (outcomeList.isEmpty()) {
        throw new IllegalArgumentException("List of instance outcomes is empty.");
    }
    AttributeStore attributeStore = WekaFeatureEncoder.getAttributeStore(instanceList);
    // Make sure "outcome" is not the name of a regular feature attribute; if it is,
    // the class attribute is renamed with the class-attribute prefix.
    Attribute outcomeAttribute = createOutcomeAttribute(outcomeList, isRegressionExperiment);
    if (attributeStore.containsAttributeName(CLASS_ATTRIBUTE_NAME)) {
        System.err.println("A feature with name \"outcome\" was found. Renaming outcome attribute");
        outcomeAttribute = outcomeAttribute.copy(CLASS_ATTRIBUTE_PREFIX + CLASS_ATTRIBUTE_NAME);
    }
    attributeStore.addAttribute(outcomeAttribute.name(), outcomeAttribute);
    Instances wekaInstances = new Instances(RELATION_NAME, attributeStore.getAttributes(), instanceList.size());
    wekaInstances.setClass(outcomeAttribute);
    if (!outputFile.exists()) {
        // BUGFIX: create the PARENT directories, not a directory at the file's own
        // path. The original called outputFile.mkdirs(), which creates a directory
        // named like the output file and makes createNewFile() fail afterwards.
        File parentDir = outputFile.getParentFile();
        if (parentDir != null && !parentDir.exists() && !parentDir.mkdirs()) {
            throw new IOException("Could not create directory: " + parentDir);
        }
        outputFile.createNewFile();
    }
    ArffSaver saver = new ArffSaver();
    // Incremental mode writes one instance at a time instead of buffering all of them.
    saver.setRetrieval(Saver.INCREMENTAL);
    saver.setFile(outputFile);
    saver.setCompressOutput(true);
    saver.setInstances(wekaInstances);
    for (int i = 0; i < instanceList.size(); i++) {
        Instance instance = instanceList.get(i);
        double[] featureValues = getFeatureValues(attributeStore, instance);
        weka.core.Instance wekaInstance;
        if (useDenseInstances) {
            wekaInstance = new DenseInstance(1.0, featureValues);
        } else {
            wekaInstance = new SparseInstance(1.0, featureValues);
        }
        wekaInstance.setDataset(wekaInstances);
        String outcome = outcomeList.get(i);
        if (isRegressionExperiment) {
            wekaInstance.setClassValue(Double.parseDouble(outcome));
        } else {
            wekaInstance.setClassValue(outcome);
        }
        Double instanceWeight = instance.getWeight();
        if (useWeights) {
            wekaInstance.setWeight(instanceWeight);
        }
        saver.writeIncremental(wekaInstance);
    }
    // Passing null finishes the incremental saving process.
    saver.writeIncremental(null);
}
Usage of weka.core.DenseInstance in project dkpro-tc (by dkpro):
class ReplaceMissingValuesWithZeroFilter, method convertInstance.
/**
 * Converts a single instance, replacing missing nominal/numeric values (except the class
 * attribute) with zero. The converted instance is appended to the end of the output queue.
 *
 * @param instance
 *            the instance to convert
 */
private void convertInstance(Instance instance) {
    Instance converted;
    if (instance instanceof SparseInstance) {
        // Sparse case: a missing nominal/numeric value is simply dropped from the
        // sparse representation, which makes it an implicit zero.
        int total = instance.numValues();
        double[] keptValues = new double[total];
        int[] keptIndices = new int[total];
        int kept = 0;
        for (int pos = 0; pos < total; pos++) {
            boolean dropAsZero = instance.isMissingSparse(pos)
                    && getInputFormat().classIndex() != instance.index(pos)
                    && (instance.attributeSparse(pos).isNominal() || instance.attributeSparse(pos).isNumeric());
            if (!dropAsZero) {
                keptValues[kept] = instance.valueSparse(pos);
                keptIndices[kept] = instance.index(pos);
                kept++;
            }
        }
        if (kept == total) {
            // Nothing was dropped; the full arrays can be reused as-is.
            converted = new SparseInstance(instance.weight(), keptValues, keptIndices, instance.numAttributes());
        } else {
            // Trim the arrays down to the entries that were actually kept.
            double[] trimmedValues = new double[kept];
            int[] trimmedIndices = new int[kept];
            System.arraycopy(keptValues, 0, trimmedValues, 0, kept);
            System.arraycopy(keptIndices, 0, trimmedIndices, 0, kept);
            converted = new SparseInstance(instance.weight(), trimmedValues, trimmedIndices, instance.numAttributes());
        }
    } else {
        // Dense case: missing nominal/numeric values are overwritten with an explicit 0.0.
        double[] values = new double[getInputFormat().numAttributes()];
        for (int pos = 0; pos < instance.numAttributes(); pos++) {
            boolean zeroOut = instance.isMissing(pos)
                    && getInputFormat().classIndex() != pos
                    && (getInputFormat().attribute(pos).isNominal() || getInputFormat().attribute(pos).isNumeric());
            values[pos] = zeroOut ? 0.0d : instance.value(pos);
        }
        converted = new DenseInstance(instance.weight(), values);
    }
    converted.setDataset(instance.dataset());
    push(converted);
}
Usage of weka.core.DenseInstance in project TrakEM2 (by trakem2):
class LineageClassifier, method classify.
/**
 * Classifies the given feature vector with this thread's classifier instance.
 *
 * @param vector
 *            the feature values of the sample to classify
 * @return {@code true} when the classifier predicts class 1, {@code false} for class 0
 * @throws Exception
 *             if Weka classification fails
 */
public static final boolean classify(final double[] vector) throws Exception {
    // Obtain or lazily create a per-thread Operator; access to the shared table is
    // synchronized to avoid clashes within weka.
    final Thread current = Thread.currentThread();
    Operator operator;
    synchronized (table) {
        operator = table.get(current);
        if (operator == null) {
            operator = new Operator();
            table.put(current, operator);
        }
    }
    final Instance sample = new DenseInstance(1, vector);
    sample.setDataset(operator.data);
    // The classifier was trained on a binary outcome, represented in weka as 0 or 1.
    final double predicted = operator.c.classifyInstance(sample);
    return ((int) Math.round(predicted)) == 1;
}
Usage of weka.core.DenseInstance in project cia (by Hack23):
class WordCounterImpl, method calculateWordCount.
/**
 * Counts word occurrences in a document's HTML content using Weka's
 * {@code StringToWordVector} filter with a unigram tokenizer.
 *
 * @param documentContentData
 *            the document whose content is analyzed
 * @param maxResult
 *            the maximum number of words to keep in the vector
 * @return a map from word to its occurrence count; empty on filter failure
 */
@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData, final int maxResult) {
    final String html = documentContentData.getContent();
    // One string attribute holding the raw html content (null values = string attribute).
    final Attribute input = new Attribute(HTML, (ArrayList<String>) null);
    final ArrayList<Attribute> inputVec = new ArrayList<>();
    inputVec.add(input);
    final Instances htmlInst = new Instances(HTML, inputVec, 1);
    htmlInst.add(new DenseInstance(1));
    htmlInst.instance(0).setValue(0, html);
    // Treat every token shorter than 5 characters as a stopword.
    final StopwordsHandler stopwordsHandler = new StopwordsHandler() {

        @Override
        public boolean isStopword(final String word) {
            return word.length() < 5;
        }
    };
    final NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);
    tokenizer.setDelimiters(TOKEN_DELIMITERS);
    final StringToWordVector filter = new StringToWordVector();
    filter.setTokenizer(tokenizer);
    filter.setStopwordsHandler(stopwordsHandler);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(true);
    filter.setWordsToKeep(maxResult);
    final Map<String, Integer> result = new HashMap<>();
    try {
        filter.setInputFormat(htmlInst);
        final Instances dataFiltered = Filter.useFilter(htmlInst, filter);
        final Instance last = dataFiltered.lastInstance();
        final int numAttributes = last.numAttributes();
        for (int i = 0; i < numAttributes; i++) {
            // BUGFIX: read the numeric word count directly instead of parsing the
            // instance's string rendering, which is format-dependent and could throw
            // NumberFormatException for non-plain-integer output.
            result.put(last.attribute(i).name(), (int) last.value(i));
        }
    } catch (final Exception e) {
        LOGGER.warn("Problem calculating wordcount for : {} , exception:{}", documentContentData.getId(), e);
    }
    return result;
}
Usage of weka.core.DenseInstance in project dkpro-tc (by dkpro):
class WekaUtils, method instanceListToArffFileMultiLabel.
/**
 * Writes a list of TC instances to an ARFF file. Multi-label (Meka) case.
 *
 * @param outputFile
 *            the ARFF output file; missing parent directories are created
 * @param instances
 *            the instances to write
 * @param useDenseInstances
 *            if {@code true}, emit dense Weka instances, otherwise sparse ones
 * @param useWeights
 *            if {@code true}, each instance's weight is transferred to the Weka instance
 * @throws Exception
 *             in case of errors
 */
public static void instanceListToArffFileMultiLabel(File outputFile, List<Instance> instances, boolean useDenseInstances, boolean useWeights) throws Exception {
    AttributeStore attributeStore = WekaFeatureEncoder.getAttributeStore(instances);
    List<String> outcomes = new ArrayList<>();
    for (Instance i : instances) {
        outcomes.add(i.getOutcome());
    }
    List<Attribute> outcomeAttributes = createOutcomeAttributes(new ArrayList<String>(outcomes));
    // in Meka, class label attributes have to go on top
    for (Attribute attribute : outcomeAttributes) {
        attributeStore.addAttributeAtBegin(attribute.name(), attribute);
    }
    // The "-C <n>" suffix in the relation name tells Meka how many leading attributes
    // are class labels (for Meka-internal use).
    Instances wekaInstances = new Instances(RELATION_NAME + ": -C " + outcomeAttributes.size() + " ", attributeStore.getAttributes(), instances.size());
    wekaInstances.setClassIndex(outcomeAttributes.size());
    if (!outputFile.exists()) {
        // BUGFIX: create the PARENT directories, not a directory at the file's own
        // path. The original called outputFile.mkdirs(), which creates a directory
        // named like the output file and makes createNewFile() fail afterwards.
        File parentDir = outputFile.getParentFile();
        if (parentDir != null && !parentDir.exists() && !parentDir.mkdirs()) {
            throw new IOException("Could not create directory: " + parentDir);
        }
        outputFile.createNewFile();
    }
    ArffSaver saver = new ArffSaver();
    // Incremental mode writes one instance at a time instead of buffering all of them.
    saver.setRetrieval(Saver.INCREMENTAL);
    saver.setFile(outputFile);
    saver.setCompressOutput(true);
    saver.setInstances(wekaInstances);
    for (int i = 0; i < instances.size(); i++) {
        Instance instance = instances.get(i);
        double[] featureValues = getFeatureValues(attributeStore, instance);
        // Set the class label values: 1.0 when the instance carries the label, else 0.0.
        // NOTE(review): split() interprets CLASS_ATTRIBUTE_PREFIX as a regex; this is
        // only safe while the prefix contains no regex metacharacters — verify.
        List<String> instanceOutcome = instance.getOutcomes();
        for (Attribute label : outcomeAttributes) {
            String labelname = label.name();
            featureValues[attributeStore.getAttributeOffset(labelname)] = instanceOutcome.contains(labelname.split(CLASS_ATTRIBUTE_PREFIX)[1]) ? 1.0d : 0.0d;
        }
        weka.core.Instance wekaInstance;
        if (useDenseInstances) {
            wekaInstance = new DenseInstance(1.0, featureValues);
        } else {
            wekaInstance = new SparseInstance(1.0, featureValues);
        }
        wekaInstance.setDataset(wekaInstances);
        Double instanceWeight = instance.getWeight();
        if (useWeights) {
            wekaInstance.setWeight(instanceWeight);
        }
        saver.writeIncremental(wekaInstance);
    }
    // Passing null finishes the incremental saving process.
    saver.writeIncremental(null);
}
End of aggregated usage examples for weka.core.DenseInstance.