Search in sources :

Example 31 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class ExtractFeaturesConnector method getFeatureNames.

/**
 * Runs feature extraction once over the given CAS so that the feature metadata
 * can be collected and written to {@code outputDirectory}.
 *
 * @param jcas the CAS to extract instances from
 * @throws AnalysisEngineProcessException if extraction fails, or if collecting or
 *         writing the metadata fails (the original failure is kept as the cause)
 */
private void getFeatureNames(JCas jcas) throws AnalysisEngineProcessException {
    // We run one time through feature extraction to get all features names
    try {
        List<Instance> instances = instanceExtractor.getInstances(jcas, false);
        featureMeta.collectMetaData(instances);
        featureMeta.writeMetaData(outputDirectory);
    } catch (AnalysisEngineProcessException e) {
        // Already the declared exception type: rethrow unchanged rather than
        // wrapping it in a second AnalysisEngineProcessException.
        throw e;
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
}
Also used : Instance(org.dkpro.tc.api.features.Instance) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) IOException(java.io.IOException)

Example 32 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class ExtractFeaturesConnector method enforceMatchingFeatures.

/**
 * Restricts every instance to the features whose names are listed by
 * {@code featureMeta.getFeatureNames()}. Outside of testing mode the input list is
 * returned unchanged.
 * <p>
 * Note that the instances are modified in place via {@code setFeatures}; the
 * returned list holds the same instance objects in the same order.
 *
 * @param instances the instances to filter
 * @return the input list itself when not testing, otherwise a new list holding the
 *         filtered instances
 */
private List<Instance> enforceMatchingFeatures(List<Instance> instances) {
    if (!isTesting) {
        return instances;
    }
    List<Instance> filtered = new ArrayList<>();
    for (Instance instance : instances) {
        List<Feature> retained = new ArrayList<>();
        for (Feature feature : instance.getFeatures()) {
            // Keep only features whose name appears in the feature metadata.
            boolean known = featureMeta.getFeatureNames().contains(feature.getName());
            if (known) {
                retained.add(feature);
            }
        }
        instance.setFeatures(retained);
        filtered.add(instance);
    }
    return filtered;
}
Also used : Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) Feature(org.dkpro.tc.api.features.Feature)

Example 33 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class InstanceExtractor method getSequenceInstances.

/**
 * Builds one {@link Instance} per {@code TextClassificationTarget} covered by each
 * {@code TextClassificationSequence} in the CAS.
 * <p>
 * Sequences are numbered from zero in iteration order. Within a sequence the
 * targets are numbered from zero as well, and that number is written back to the
 * target via {@code setId}.
 *
 * @param jcas      the CAS holding sequences and targets
 * @param useSparse whether features are obtained via {@code getSparse} (true) or
 *                  {@code getDense} (false)
 * @return the extracted instances, in sequence and target order
 * @throws TextClassificationException declared by this method; presumably raised by
 *         the feature, outcome or weight helpers (not visible here)
 */
public List<Instance> getSequenceInstances(JCas jcas, boolean useSparse) throws TextClassificationException {
    List<Instance> result = new ArrayList<Instance>();
    int jcasId = JCasUtil.selectSingle(jcas, JCasId.class).getId();
    int sequenceId = 0;
    for (TextClassificationSequence sequence : JCasUtil.select(jcas, TextClassificationSequence.class)) {
        // The position counter restarts for every sequence.
        int position = 0;
        for (TextClassificationTarget target : JCasUtil.selectCovered(jcas, TextClassificationTarget.class, sequence)) {
            // The target id must be assigned before the instance id feature is retrieved.
            target.setId(position);
            position++;

            Instance current = new Instance();
            if (addInstanceId) {
                current.addFeature(InstanceIdFeature.retrieve(jcas, target, sequenceId));
            }
            for (FeatureExtractorResource_ImplBase extractor : featureExtractors) {
                if (!useSparse) {
                    current.addFeatures(getDense(jcas, target, extractor));
                } else {
                    current.addFeatures(getSparse(jcas, target, extractor));
                }
            }

            // outcome label(s) and weight
            current.setOutcomes(getOutcomes(jcas, target));
            current.setWeight(getWeight(jcas, target));

            // bookkeeping that locates the instance within the CAS
            current.setJcasId(jcasId);
            current.setSequenceId(sequenceId);
            current.setSequencePosition(target.getId());

            result.add(current);
        }
        sequenceId++;
    }
    return result;
}
Also used : JCasId(org.dkpro.tc.api.type.JCasId) Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) TextClassificationTarget(org.dkpro.tc.api.type.TextClassificationTarget) TextClassificationSequence(org.dkpro.tc.api.type.TextClassificationSequence) FeatureExtractorResource_ImplBase(org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase)

Example 34 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class InstanceExtractor method getInstances.

/**
 * Extracts the instances of a CAS according to the configured mode: sequence mode
 * and unit mode may yield several instances, any other mode yields exactly one.
 *
 * @param aJCas         the CAS to extract from
 * @param extractSparse passed through to the mode-specific extraction method
 * @return a new list holding the extracted instances
 * @throws AnalysisEngineProcessException wrapping any exception raised during extraction
 */
public List<Instance> getInstances(JCas aJCas, boolean extractSparse) throws AnalysisEngineProcessException {
    List<Instance> collected = new ArrayList<>();
    try {
        if (isSequenceMode()) {
            collected.addAll(getSequenceInstances(aJCas, extractSparse));
        } else if (isUnitMode()) {
            collected.addAll(getUnitInstances(aJCas, extractSparse));
        } else {
            collected.add(getSingleInstance(aJCas, extractSparse));
        }
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
    return collected;
}
Also used : Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) List(java.util.List) AnalysisEngineProcessException(org.apache.uima.analysis_engine.AnalysisEngineProcessException) TextClassificationException(org.dkpro.tc.api.exception.TextClassificationException) CASException(org.apache.uima.cas.CASException)

Example 35 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class TestTaskUtils method testInstanceMultiplicationWithUnitId.

/**
 * Extracts instances in {@code Constants.FM_SEQUENCE} mode with no feature
 * extractors configured and checks, for each of the six expected instances, the
 * value of its first feature, its sequence id, its sequence position and its outcome.
 */
@Test
public void testInstanceMultiplicationWithUnitId() throws Exception {
    JCas jCas = initJCas(true);
    FeatureExtractorResource_ImplBase[] featureExtractors = {};
    InstanceExtractor ie = new InstanceExtractor(Constants.FM_SEQUENCE, featureExtractors, true);
    List<Instance> multipleInstances = ie.getInstances(jCas, false);

    // Expected values per instance, indexed by position in the result list.
    String[] expectedFirstFeatureValues = { "4711_0_0_a", "4711_0_1_car", "4711_0_2_drives",
            "4711_1_0_the", "4711_1_1_hedgehogs", "4711_1_2_dies" };
    int[] expectedSequenceIds = { 0, 0, 0, 1, 1, 1 };
    int[] expectedSequencePositions = { 0, 1, 2, 0, 1, 2 };
    String[] expectedOutcomes = { "DT", "NN", "VBZ", "DT", "NN", "VBZ" };

    assertEquals(6, multipleInstances.size());
    for (int idx = 0; idx < expectedFirstFeatureValues.length; idx++) {
        assertEquals(expectedFirstFeatureValues[idx],
                multipleInstances.get(idx).getFeatures().iterator().next().getValue());
        assertEquals(expectedSequenceIds[idx], multipleInstances.get(idx).getSequenceId());
        assertEquals(expectedSequencePositions[idx], multipleInstances.get(idx).getSequencePosition());
        assertEquals(expectedOutcomes[idx], multipleInstances.get(idx).getOutcome());
    }
}
Also used : Instance(org.dkpro.tc.api.features.Instance) JCas(org.apache.uima.jcas.JCas) FeatureExtractorResource_ImplBase(org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase) InstanceExtractor(org.dkpro.tc.core.task.uima.InstanceExtractor) Test(org.junit.Test)

Aggregations

Instance (org.dkpro.tc.api.features.Instance): 61; ArrayList (java.util.ArrayList): 38; Feature (org.dkpro.tc.api.features.Feature): 30; Test (org.junit.Test): 27; File (java.io.File): 17; AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException): 14; IOException (java.io.IOException): 12; Gson (com.google.gson.Gson): 8; TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException): 8; Attribute (weka.core.Attribute): 8; DenseInstance (weka.core.DenseInstance): 8; Instances (weka.core.Instances): 8; SparseInstance (weka.core.SparseInstance): 8; FeatureExtractorResource_ImplBase (org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase): 6; BufferedReader (java.io.BufferedReader): 5; FileInputStream (java.io.FileInputStream): 5; InputStreamReader (java.io.InputStreamReader): 5; AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription): 5; CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription): 5; ExternalResourceDescription (org.apache.uima.resource.ExternalResourceDescription): 5