Search in sources :

Example 1 with InstanceExtractor

use of org.dkpro.tc.core.task.uima.InstanceExtractor in project dkpro-tc by dkpro.

the class TestTaskUtils method testInstanceMultiplicationWithoutUnitId.

/**
 * Checks the instances produced in sequence feature mode for a CAS created via
 * {@code initJCas(false)} (presumably: no unit ids set -- confirm against the helper).
 *
 * Six instances are expected, forming two sequences of three positions each; the first
 * feature value of every instance must follow the pattern {@code 4711_<sequence>_<position>}.
 */
@Test
public void testInstanceMultiplicationWithoutUnitId() throws Exception {
    JCas jCas = initJCas(false);
    FeatureExtractorResource_ImplBase[] featureExtractors = {};
    InstanceExtractor ie = new InstanceExtractor(Constants.FM_SEQUENCE, featureExtractors, true);
    List<Instance> multipleInstances = ie.getInstances(jCas, false);
    assertEquals(6, multipleInstances.size());

    // Expected values per instance, in extraction order (sequence 1 first, then sequence 2).
    String[] expectedIds = { "4711_0_0", "4711_0_1", "4711_0_2", "4711_1_0", "4711_1_1", "4711_1_2" };
    String[] expectedOutcomes = { "DT", "NN", "VBZ", "DT", "NN", "VBZ" };
    int unitsPerSequence = 3;

    for (int i = 0; i < expectedIds.length; i++) {
        Instance current = multipleInstances.get(i);
        assertEquals(expectedIds[i], current.getFeatures().iterator().next().getValue());
        // Instances 0-2 belong to sequence 0, instances 3-5 to sequence 1.
        assertEquals(i / unitsPerSequence, current.getSequenceId());
        assertEquals(i % unitsPerSequence, current.getSequencePosition());
        assertEquals(expectedOutcomes[i], current.getOutcome());
    }
}
Also used : Instance(org.dkpro.tc.api.features.Instance) JCas(org.apache.uima.jcas.JCas) FeatureExtractorResource_ImplBase(org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase) InstanceExtractor(org.dkpro.tc.core.task.uima.InstanceExtractor) Test(org.junit.Test)

Example 2 with InstanceExtractor

use of org.dkpro.tc.core.task.uima.InstanceExtractor in project dkpro-tc by dkpro.

the class WekaLoadModelConnector method process.

/**
 * Classifies the first instance extracted from the given CAS with the loaded classifier and
 * stores the prediction as {@link TextClassificationOutcome} annotation(s).
 *
 * In single-label and regression mode the predicted value is written to the outcome
 * returned by {@code getOutcome(jcas)}. In multi-label mode the existing outcome
 * annotation(s) are removed from the indexes and one new outcome annotation is added for
 * every label whose predicted value reaches the bipartition threshold.
 *
 * @param jcas the CAS to classify
 * @throws AnalysisEngineProcessException if instance extraction, conversion to a Weka/Meka
 *         instance, or classification fails
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    Instance instance;
    try {
        InstanceExtractor extractor = new InstanceExtractor(featureMode, featureExtractors, false);
        // Only the first extracted instance is classified.
        instance = extractor.getInstances(jcas, useSparse).get(0);
    } catch (Exception e1) {
        throw new AnalysisEngineProcessException(e1);
    }

    boolean isMultiLabel = learningMode.equals(Constants.LM_MULTI_LABEL);
    boolean isRegression = learningMode.equals(Constants.LM_REGRESSION);

    if (!isMultiLabel) {
        // Single-label / regression: exactly one predicted value.
        Object prediction;
        try {
            weka.core.Instance wekaInstance = WekaUtils.tcInstanceToWekaInstance(instance, trainingData,
                    classLabels, isRegression);
            if (isRegression) {
                prediction = cls.classifyInstance(wekaInstance);
            } else {
                // The classifier returns the index of the predicted label.
                prediction = classLabels.get((int) cls.classifyInstance(wekaInstance));
            }
        } catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        }
        getOutcome(jcas).setOutcome(prediction.toString());
        return;
    }

    // Multi-label: obtain one value per label from the classifier.
    weka.core.Instance mekaInstance;
    double[] distribution;
    try {
        mekaInstance = WekaUtils.tcInstanceToMekaInstance(instance, trainingData, classLabels);
        distribution = cls.distributionForInstance(mekaInstance);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    // Keep every label whose value reaches the threshold; the label is the part of the
    // attribute name that follows the class attribute prefix.
    List<String> outcomes = new ArrayList<String>();
    for (int i = 0; i < distribution.length; i++) {
        if (distribution[i] >= Double.valueOf(bipartitionThreshold)) {
            String attributeName = mekaInstance.attribute(i).name();
            outcomes.add(attributeName.split(WekaDataWriter.CLASS_ATTRIBUTE_PREFIX)[1]);
        }
    }

    // Drop the outcome annotation(s) currently in the CAS before adding the predictions.
    boolean wholeDocument = FM_DOCUMENT.equals(featureMode) || FM_PAIR.equals(featureMode);
    if (wholeDocument) {
        // Snapshot the selection first, then remove (presumably because removing while
        // iterating the live selection is unsafe -- the copy mirrors the original code).
        List<Annotation> stale = new ArrayList<Annotation>(
                JCasUtil.select(jcas, TextClassificationOutcome.class));
        for (Annotation old : stale) {
            old.removeFromIndexes();
        }
    } else {
        getOutcome(jcas).removeFromIndexes();
    }

    // The first predicted label is added outside of the guarded section below.
    if (!outcomes.isEmpty()) {
        TextClassificationOutcome first = new TextClassificationOutcome(jcas);
        first.setOutcome(outcomes.get(0));
        first.addToIndexes();
    }
    // Any further labels are added with failures logged and rethrown unchecked.
    if (outcomes.size() > 1) {
        try {
            for (String label : outcomes.subList(1, outcomes.size())) {
                TextClassificationOutcome additional = new TextClassificationOutcome(jcas);
                additional.setOutcome(label);
                additional.addToIndexes();
            }
        } catch (Exception ex) {
            String msg = "Error while trying to retrieve TC focus from CAS. Details: " + ex.getMessage();
            Logger.getLogger(getClass()).error(msg, ex);
            throw new RuntimeException(msg, ex);
        }
    }
}
Also used : Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) Annotation(org.apache.uima.jcas.tcas.Annotation) TextClassificationOutcome(org.dkpro.tc.api.type.TextClassificationOutcome) InstanceExtractor(org.dkpro.tc.core.task.uima.InstanceExtractor)

Example 3 with InstanceExtractor

use of org.dkpro.tc.core.task.uima.InstanceExtractor in project dkpro-tc by dkpro.

the class TestTaskUtils method testInstanceMultiplicationWithUnitId.

/**
 * Checks the instances produced in sequence feature mode for a CAS created via
 * {@code initJCas(true)} (presumably: unit ids set -- confirm against the helper).
 *
 * Six instances are expected, forming two sequences of three positions each; the first
 * feature value of every instance must follow the pattern
 * {@code 4711_<sequence>_<position>_<suffix>}.
 */
@Test
public void testInstanceMultiplicationWithUnitId() throws Exception {
    JCas jCas = initJCas(true);
    FeatureExtractorResource_ImplBase[] featureExtractors = {};
    InstanceExtractor ie = new InstanceExtractor(Constants.FM_SEQUENCE, featureExtractors, true);
    List<Instance> multipleInstances = ie.getInstances(jCas, false);
    assertEquals(6, multipleInstances.size());

    // Expected values per instance, in extraction order.
    String[] expectedIds = { "4711_0_0_a", "4711_0_1_car", "4711_0_2_drives", "4711_1_0_the",
            "4711_1_1_hedgehogs", "4711_1_2_dies" };
    String[] expectedOutcomes = { "DT", "NN", "VBZ", "DT", "NN", "VBZ" };
    int unitsPerSequence = 3;

    for (int i = 0; i < expectedIds.length; i++) {
        Instance current = multipleInstances.get(i);
        assertEquals(expectedIds[i], current.getFeatures().iterator().next().getValue());
        // Instances 0-2 belong to sequence 0, instances 3-5 to sequence 1.
        assertEquals(i / unitsPerSequence, current.getSequenceId());
        assertEquals(i % unitsPerSequence, current.getSequencePosition());
        assertEquals(expectedOutcomes[i], current.getOutcome());
    }
}
Also used : Instance(org.dkpro.tc.api.features.Instance) JCas(org.apache.uima.jcas.JCas) FeatureExtractorResource_ImplBase(org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase) InstanceExtractor(org.dkpro.tc.core.task.uima.InstanceExtractor) Test(org.junit.Test)

Example 4 with InstanceExtractor

use of org.dkpro.tc.core.task.uima.InstanceExtractor in project dkpro-tc by dkpro.

the class TestTaskUtils method testUnitModeInstanceNumbering.

/**
 * Tests the instance numbering in unit mode, i.e. without sequences.
 *
 * Six instances are expected; every instance must report sequence id 0, its sequence
 * position must equal its index in the returned list, and its first feature value must
 * follow the pattern {@code 4711_<position>_<suffix>}.
 */
@Test
public void testUnitModeInstanceNumbering() throws Exception {
    JCas jCas = initJCas(true);
    FeatureExtractorResource_ImplBase[] featureExtractors = {};
    InstanceExtractor ie = new InstanceExtractor(Constants.FM_UNIT, featureExtractors, true);
    List<Instance> multipleInstances = ie.getInstances(jCas, false);
    assertEquals(6, multipleInstances.size());

    // Expected values per instance, in extraction order.
    String[] expectedIds = { "4711_0_a", "4711_1_car", "4711_2_drives", "4711_3_the", "4711_4_hedgehogs",
            "4711_5_dies" };
    String[] expectedOutcomes = { "DT", "NN", "VBZ", "DT", "NN", "VBZ" };

    for (int i = 0; i < expectedIds.length; i++) {
        Instance current = multipleInstances.get(i);
        assertEquals(expectedIds[i], current.getFeatures().iterator().next().getValue());
        // No sequences in unit mode: the id stays 0 while the position keeps counting up.
        assertEquals(0, current.getSequenceId());
        assertEquals(i, current.getSequencePosition());
        assertEquals(expectedOutcomes[i], current.getOutcome());
    }
}
Also used : Instance(org.dkpro.tc.api.features.Instance) JCas(org.apache.uima.jcas.JCas) FeatureExtractorResource_ImplBase(org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase) InstanceExtractor(org.dkpro.tc.core.task.uima.InstanceExtractor) Test(org.junit.Test)

Example 5 with InstanceExtractor

use of org.dkpro.tc.core.task.uima.InstanceExtractor in project dkpro-tc by dkpro.

the class LibsvmDataFormatLoadModelConnector method createInputFile.

/**
 * Extracts the instances of the given CAS and writes them to a temporary file, one line
 * per instance.
 *
 * Each line starts with {@code OUTCOME_PLACEHOLDER} followed by the result of
 * {@code injectSequenceId(instance)}; every feature that passes
 * {@code sanityCheckValue} is appended as a tab-separated {@code <mapped name>:<value>}
 * entry, where the mapped name is looked up in {@code featureMapping}.
 *
 * The file is registered for deletion on JVM exit. The writer is managed by
 * try-with-resources so the file handle is released even if extraction or writing fails.
 *
 * @param jcas the CAS to extract instances from
 * @return the temporary file containing the written instances
 * @throws Exception if the temporary file cannot be created or written, or if instance
 *         extraction fails
 */
private File createInputFile(JCas jcas) throws Exception {
    File tempFile = FileUtil.createTempFile("libsvm", ".txt");
    tempFile.deleteOnExit();
    try (BufferedWriter bw = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(tempFile), "utf-8"))) {
        InstanceExtractor extractor = new InstanceExtractor(featureMode, featureExtractors, true);
        List<Instance> instances = extractor.getInstances(jcas, true);
        for (Instance instance : instances) {
            bw.write(OUTCOME_PLACEHOLDER);
            bw.write(injectSequenceId(instance));
            for (Feature f : instance.getFeatures()) {
                // Features rejected by the sanity check are left out of the line.
                if (!sanityCheckValue(f)) {
                    continue;
                }
                bw.write("\t");
                bw.write(featureMapping.get(f.getName()) + ":" + f.getValue());
            }
            bw.write("\n");
        }
    }
    return tempFile;
}
Also used : Instance(org.dkpro.tc.api.features.Instance) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) InstanceExtractor(org.dkpro.tc.core.task.uima.InstanceExtractor) Feature(org.dkpro.tc.api.features.Feature) BufferedWriter(java.io.BufferedWriter)

Aggregations

Instance (org.dkpro.tc.api.features.Instance)5 InstanceExtractor (org.dkpro.tc.core.task.uima.InstanceExtractor)5 JCas (org.apache.uima.jcas.JCas)3 FeatureExtractorResource_ImplBase (org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase)3 Test (org.junit.Test)3 BufferedWriter (java.io.BufferedWriter)1 File (java.io.File)1 FileNotFoundException (java.io.FileNotFoundException)1 FileOutputStream (java.io.FileOutputStream)1 IOException (java.io.IOException)1 OutputStreamWriter (java.io.OutputStreamWriter)1 ArrayList (java.util.ArrayList)1 AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException)1 Annotation (org.apache.uima.jcas.tcas.Annotation)1 ResourceInitializationException (org.apache.uima.resource.ResourceInitializationException)1 Feature (org.dkpro.tc.api.features.Feature)1 TextClassificationOutcome (org.dkpro.tc.api.type.TextClassificationOutcome)1