Search in sources :

Example 11 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaUtilTest method tcInstanceToWekaInstanceTest.

/**
 * Converts two TC instances against the same training-data header and checks that both
 * resulting Weka instances share that header and expose all five declared attributes.
 * Note that "feature3_{{" is set on the instances but not declared in the header, while
 * "feature5" is declared in the header but set on neither instance.
 */
@Test
public void tcInstanceToWekaInstanceTest() throws Exception {
    List<String> outcomeValues = Arrays.asList("outc_1", "outc_2", "outc_3");

    // Header of the (empty) training data set the instances are mapped onto.
    ArrayList<Attribute> header = new ArrayList<>();
    header.add(new Attribute("feature5"));
    header.add(new Attribute("feature2"));
    header.add(new Attribute("feature4", Arrays.asList("val_1", "val_2")));
    header.add(new Attribute("feature1"));
    header.add(new Attribute("outcome", outcomeValues));
    Instances trainingData = new Instances("test", header, 0);

    Instance first = new Instance();
    first.addFeature(new Feature("feature1", 2, FeatureType.NUMERIC));
    first.addFeature(new Feature("feature2", 2, FeatureType.NUMERIC));
    first.addFeature(new Feature("feature3_{{", "a", FeatureType.STRING));

    Instance second = new Instance();
    second.addFeature(new Feature("feature1", 1, FeatureType.NUMERIC));
    second.addFeature(new Feature("feature4", "val_1", FeatureType.STRING));
    second.addFeature(new Feature("feature3_{{", "b", FeatureType.STRING));

    weka.core.Instance convertedFirst = WekaUtils.tcInstanceToWekaInstance(first, trainingData, outcomeValues, false);
    weka.core.Instance convertedSecond = WekaUtils.tcInstanceToWekaInstance(second, trainingData, outcomeValues, false);

    boolean sameHeader = convertedFirst.equalHeaders(convertedSecond);
    assertEquals(true, sameHeader);
    assertEquals(5, convertedFirst.numAttributes());

    // Attach each converted instance to its own data set and dump it for manual inspection.
    convertedFirst.dataset().add(convertedFirst);
    convertedSecond.dataset().add(convertedSecond);
    System.out.println(convertedFirst.dataset() + "\n");
    System.out.println(convertedSecond.dataset() + "\n");
}
Also used : Instances(weka.core.Instances) Instance(org.dkpro.tc.api.features.Instance) Attribute(weka.core.Attribute) ArrayList(java.util.ArrayList) Feature(org.dkpro.tc.api.features.Feature) Test(org.junit.Test)

Example 12 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaUtilTest method tcInstanceToWekaInstanceFailTest.

/**
 * Expects the conversion to be rejected with an IllegalArgumentException. The instance
 * carries the value "val_1" for "feature4", whereas the header only declares "val_4"
 * and "val_2" for that attribute.
 */
@Test(expected = IllegalArgumentException.class)
public void tcInstanceToWekaInstanceFailTest() throws Exception {
    List<String> outcomeValues = Arrays.asList("outc_1", "outc_2", "outc_3");

    ArrayList<Attribute> header = new ArrayList<>();
    header.add(new Attribute("feature2"));
    header.add(new Attribute("feature4", Arrays.asList("val_4", "val_2")));
    header.add(new Attribute("outcome", outcomeValues));
    Instances trainingData = new Instances("test", header, 0);

    Instance instance = new Instance();
    instance.addFeature(new Feature("feature1", 2, FeatureType.NUMERIC));
    instance.addFeature(new Feature("feature4", "val_1", FeatureType.STRING));
    instance.addFeature(new Feature("feature3_{{", "a", FeatureType.STRING));

    // The result is irrelevant; only the exception raised by the call matters.
    WekaUtils.tcInstanceToWekaInstance(instance, trainingData, outcomeValues, false);
}
Also used : Instances(weka.core.Instances) Instance(org.dkpro.tc.api.features.Instance) Attribute(weka.core.Attribute) ArrayList(java.util.ArrayList) Feature(org.dkpro.tc.api.features.Feature) Test(org.junit.Test)

Example 13 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaOutcomeIDReport method execute.

/**
 * Builds the outcome-id report from the stored predictions and writes it as a
 * properties file to {@code getTargetOutputFile()} using UTF-8.
 *
 * <p>In multi-label mode the properties are generated from the predictions together with
 * the multi-label result read from {@code mlResults}; otherwise they are generated from
 * the predictions and the document-id map.
 *
 * @throws Exception
 *             if reading the inputs or writing (including closing) the report fails
 */
@Override
public void execute() throws Exception {
    init();
    File arff = WekaUtils.getFile(getContext(), "", FILENAME_PREDICTIONS, AccessMode.READONLY);
    mlResults = WekaUtils.getFile(getContext(), "", WekaTestTask.evaluationBin, AccessMode.READONLY);
    Instances predictions = WekaUtils.getInstances(arff, isMultiLabel);
    List<String> labels = getLabels(isMultiLabel, isRegression);
    Properties props;
    if (isMultiLabel) {
        MultilabelResult r = WekaUtils.readMlResultFromFile(mlResults);
        props = generateMlProperties(predictions, labels, r);
    } else {
        Map<Integer, String> documentIdMap = loadDocumentMap();
        props = generateSlProperties(predictions, isRegression, isUnit, documentIdMap, labels);
    }
    // try-with-resources closes the writer on every path and, unlike a quiet close,
    // reports a failure to close instead of silently dropping it.
    try (FileWriterWithEncoding fw = new FileWriterWithEncoding(getTargetOutputFile(), "utf-8")) {
        props.store(fw, generateHeader(labels));
    }
}
Also used : Instances(weka.core.Instances) FileWriterWithEncoding(org.apache.commons.io.output.FileWriterWithEncoding) MultilabelResult(org.dkpro.tc.ml.weka.util.MultilabelResult) Properties(java.util.Properties) SortedKeyProperties(org.dkpro.tc.ml.report.util.SortedKeyProperties) File(java.io.File)

Example 14 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaSerliazeModelConnector method writeWekaSpecificInformation.

/**
 * Trains a classifier on the training data found in the task context and writes the
 * Weka-specific model artifacts into {@code outputFolder}: an empty copy of the training
 * data (header only) as "training_data", the serialized classifier, and, unless the
 * learning mode is regression, the class labels (one per line, UTF-8).
 *
 * @param aContext
 *            the task context the training data folder is resolved from
 * @throws Exception
 *             if feature selection is configured (not supported when saving a model), or
 *             if reading the data, training, or writing any artifact fails
 */
private void writeWekaSpecificInformation(TaskContext aContext) throws Exception {
    boolean isMultiLabel = learningMode.equals(Constants.LM_MULTI_LABEL);
    boolean isRegression = learningMode.equals(Constants.LM_REGRESSION);
    File trainFolder = aContext.getFolder(TEST_TASK_INPUT_KEY_TRAINING_DATA, AccessMode.READONLY);
    File arffFileTrain = new File(trainFolder, Constants.FILENAME_DATA_IN_CLASSIFIER_FORMAT);
    Instances trainData = WekaUtils.getInstances(arffFileTrain, isMultiLabel);
    trainData = WekaUtils.removeInstanceId(trainData, isMultiLabel);
    // FEATURE SELECTION
    if (!isMultiLabel) {
        if (featureSearcher != null && attributeEvaluator != null) {
            throw new Exception("Feature Selection is currently not supported in Save Model mode.");
        }
    } else {
        if (attributeEvaluator != null && labelTransformationMethod != null && numLabelsToKeep > 0) {
            throw new Exception("Feature Selection is currently not supported in Save Model mode.");
        }
    }
    // write training data header; the stream is closed even if serialization fails
    Instances emptyTrainCopy = new Instances(trainData);
    emptyTrainCopy.delete();
    try (ObjectOutputStream outT = new ObjectOutputStream(new FileOutputStream(new File(outputFolder, "training_data")))) {
        outT.writeObject(emptyTrainCopy);
    }
    // write model file
    Classifier cl = WekaUtils.getClassifier(learningMode, classificationArguments);
    cl.buildClassifier(trainData);
    File model = new File(outputFolder, MODEL_CLASSIFIER);
    mkdir(model.getParentFile());
    weka.core.SerializationHelper.write(model.getAbsolutePath(), cl);
    // write class labels file (there are no class labels in regression mode)
    if (!isRegression) {
        List<String> classLabels = WekaUtils.getClassLabels(trainData, isMultiLabel);
        String classLabelsString = StringUtils.join(classLabels, "\n");
        FileUtils.writeStringToFile(new File(outputFolder, MODEL_CLASS_LABELS), classLabelsString, "utf-8");
    }
}
Also used : Instances(weka.core.Instances) FileOutputStream(java.io.FileOutputStream) Classifier(weka.classifiers.Classifier) ObjectOutputStream(java.io.ObjectOutputStream) File(java.io.File) IOException(java.io.IOException)

Example 15 with Instances

use of weka.core.Instances in project dkpro-tc by dkpro.

the class WekaUtils method instanceListToArffFileMultiLabel.

/**
 * Writes a list of instances incrementally to an ARFF file. Multi-label case.
 *
 * <p>The outcome (label) attributes are placed in front of the feature attributes and
 * their count is encoded in the relation name ("-C n"). Output compression is enabled on
 * the saver.
 *
 * @param outputFile
 *            the output file; missing parent directories are created
 * @param instances
 *            the instances to convert
 * @param useDenseInstances
 *            if true a {@code DenseInstance} is written per instance, otherwise a
 *            {@code SparseInstance}
 * @param useWeights
 *            if true the weight of each instance is copied to the written Weka instance
 * @throws Exception
 *             in case of errors
 */
public static void instanceListToArffFileMultiLabel(File outputFile, List<Instance> instances, boolean useDenseInstances, boolean useWeights) throws Exception {
    AttributeStore attributeStore = WekaFeatureEncoder.getAttributeStore(instances);
    // NOTE(review): label attributes are built from getOutcome() per instance, while the
    // label flags below are set from getOutcomes() - confirm this covers every label.
    List<String> outcomes = new ArrayList<>();
    for (Instance i : instances) {
        outcomes.add(i.getOutcome());
    }
    List<Attribute> outcomeAttributes = createOutcomeAttributes(new ArrayList<String>(outcomes));
    // in Meka, class label attributes have to go on top
    for (Attribute attribute : outcomeAttributes) {
        attributeStore.addAttributeAtBegin(attribute.name(), attribute);
    }
    // for Meka-internal use
    Instances wekaInstances = new Instances(RELATION_NAME + ": -C " + outcomeAttributes.size() + " ", attributeStore.getAttributes(), instances.size());
    wekaInstances.setClassIndex(outcomeAttributes.size());
    if (!outputFile.exists()) {
        // Create the parent directories only; calling mkdirs() on the file itself would
        // create a directory at the very path the ARFF file is supposed to occupy.
        File parentDir = outputFile.getParentFile();
        if (parentDir != null) {
            parentDir.mkdirs();
        }
        outputFile.createNewFile();
    }
    ArffSaver saver = new ArffSaver();
    saver.setRetrieval(Saver.INCREMENTAL);
    saver.setFile(outputFile);
    saver.setCompressOutput(true);
    saver.setInstances(wekaInstances);
    for (int i = 0; i < instances.size(); i++) {
        Instance instance = instances.get(i);
        double[] featureValues = getFeatureValues(attributeStore, instance);
        // set class label values: 1.0 if the instance carries the label, 0.0 otherwise
        List<String> instanceOutcome = instance.getOutcomes();
        for (Attribute label : outcomeAttributes) {
            String labelname = label.name();
            featureValues[attributeStore.getAttributeOffset(labelname)] = instanceOutcome.contains(labelname.split(CLASS_ATTRIBUTE_PREFIX)[1]) ? 1.0d : 0.0d;
        }
        weka.core.Instance wekaInstance;
        if (useDenseInstances) {
            wekaInstance = new DenseInstance(1.0, featureValues);
        } else {
            wekaInstance = new SparseInstance(1.0, featureValues);
        }
        wekaInstance.setDataset(wekaInstances);
        if (useWeights) {
            // the weight is only looked up when it is actually applied
            wekaInstance.setWeight(instance.getWeight());
        }
        saver.writeIncremental(wekaInstance);
    }
    // finishes the incremental saving process
    saver.writeIncremental(null);
}
Also used : DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) DenseInstance(weka.core.DenseInstance) SparseInstance(weka.core.SparseInstance) Instance(org.dkpro.tc.api.features.Instance) Attribute(weka.core.Attribute) ArrayList(java.util.ArrayList) ArffSaver(weka.core.converters.ArffSaver) Instances(weka.core.Instances) MultiLabelInstances(mulan.data.MultiLabelInstances)

Aggregations

Instances (weka.core.Instances)31 Attribute (weka.core.Attribute)12 ArrayList (java.util.ArrayList)9 File (java.io.File)8 Instance (org.dkpro.tc.api.features.Instance)8 Test (org.junit.Test)8 MultiLabelInstances (mulan.data.MultiLabelInstances)7 IOException (java.io.IOException)5 DenseInstance (weka.core.DenseInstance)5 Instance (weka.core.Instance)5 ArffSaver (weka.core.converters.ArffSaver)5 Feature (org.dkpro.tc.api.features.Feature)4 Classifier (weka.classifiers.Classifier)3 FastVector (weka.core.FastVector)3 SparseInstance (weka.core.SparseInstance)3 HashMap (java.util.HashMap)2 Result (meka.core.Result)2 AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException)2 TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException)2 FeatureType (org.dkpro.tc.api.features.FeatureType)2