Search in sources :

Example 36 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class TestTaskUtils method testUnitModeInstanceNumbering.

/**
 * Checks the numbering of instances extracted in unit mode (i.e. no sequence): six instances
 * are expected, all carrying sequence id 0 and a sequence position equal to their index in
 * the returned list.
 */
@Test
public void testUnitModeInstanceNumbering() throws Exception {
    // Expected value of the first feature and expected outcome, indexed by list position.
    String[] expectedFirstFeatureValues = { "4711_0_a", "4711_1_car", "4711_2_drives", "4711_3_the",
            "4711_4_hedgehogs", "4711_5_dies" };
    String[] expectedOutcomes = { "DT", "NN", "VBZ", "DT", "NN", "VBZ" };

    JCas jCas = initJCas(true);
    FeatureExtractorResource_ImplBase[] noExtractors = {};
    InstanceExtractor extractor = new InstanceExtractor(Constants.FM_UNIT, noExtractors, true);
    List<Instance> extracted = extractor.getInstances(jCas, false);

    assertEquals(6, extracted.size());
    for (int position = 0; position < expectedFirstFeatureValues.length; position++) {
        Instance current = extracted.get(position);
        assertEquals(expectedFirstFeatureValues[position], current.getFeatures().iterator().next().getValue());
        assertEquals(0, current.getSequenceId());
        assertEquals(position, current.getSequencePosition());
        assertEquals(expectedOutcomes[position], current.getOutcome());
    }
}
Also used : Instance(org.dkpro.tc.api.features.Instance) JCas(org.apache.uima.jcas.JCas) FeatureExtractorResource_ImplBase(org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase) InstanceExtractor(org.dkpro.tc.core.task.uima.InstanceExtractor) Test(org.junit.Test)

Example 37 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class UniformClassDistributionFilter method applyFilter.

/**
 * Down-samples the instances stored in {@code f} so that every outcome keeps only as many
 * instances as the least frequent outcome has, and then overwrites {@code f} with the result.
 *
 * <p>
 * Each line of {@code f} is parsed as a JSON array of {@link Instance}. Instances are numbered
 * consecutively in reading order; the same numbering is used when collecting the ids per
 * outcome and when deciding which lines to keep, so both passes always refer to the same
 * instance. For every outcome, {@code minClassSize} randomly chosen instances are selected
 * (for the smallest class that is all of them). A line is written to the output exactly once
 * if at least one of its instances was selected.
 *
 * <p>
 * The filtered content is first written to {@code json_filtered.txt} next to {@code f}; that
 * temporary file is removed again even when copying it back fails.
 *
 * @param f
 *            the file to filter in place, read and written as UTF-8
 * @throws Exception
 *             if reading, parsing, writing or copying fails; failures while flushing/closing
 *             the temporary file are propagated so that {@code f} is never replaced by
 *             incompletely written data
 */
@Override
public void applyFilter(File f) throws Exception {
    Gson gson = new Gson();

    // Pass 1: collect the ids of all instances, grouped by their outcome.
    Map<String, List<Integer>> idsByOutcome = new HashMap<>();
    int instanceCount = 0;
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), "utf-8"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            Instance[] ins = gson.fromJson(line, Instance[].class);
            if (ins == null) {
                // Gson yields null for an empty line; such a line carries no instances
                continue;
            }
            for (Instance i : ins) {
                List<Integer> ids = idsByOutcome.get(i.getOutcome());
                if (ids == null) {
                    ids = new ArrayList<>();
                    idsByOutcome.put(i.getOutcome(), ids);
                }
                ids.add(instanceCount++);
            }
        }
    }

    // find the size of the smallest class
    int minClassSize = Integer.MAX_VALUE;
    for (List<Integer> ids : idsByOutcome.values()) {
        minClassSize = Math.min(minClassSize, ids.size());
    }

    // shuffle the ids of each class and mark the first minClassSize of them as selected;
    // a flag per instance id gives a constant-time lookup in the second pass
    boolean[] selected = new boolean[instanceCount];
    for (List<Integer> ids : idsByOutcome.values()) {
        Collections.shuffle(ids);
        for (Integer id : ids.subList(0, minClassSize)) {
            selected[id] = true;
        }
    }

    // Pass 2: copy every line that holds at least one selected instance.
    File tmpOut = new File(f.getParentFile(), "json_filtered.txt");
    try {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), "utf-8"));
                BufferedWriter writer = new BufferedWriter(
                        new OutputStreamWriter(new FileOutputStream(tmpOut), "utf-8"))) {
            int instanceId = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                Instance[] ins = gson.fromJson(line, Instance[].class);
                if (ins == null) {
                    continue;
                }
                // advance the id once per instance, exactly as in the first pass
                boolean keepLine = false;
                for (int k = 0; k < ins.length; k++) {
                    if (selected[instanceId++]) {
                        keepLine = true;
                    }
                }
                if (keepLine) {
                    writer.write(line + "\n");
                }
            }
        }
        // only reached when the temporary file was written and closed without error
        FileUtils.copyFile(tmpOut, f);
    } finally {
        FileUtils.deleteQuietly(tmpOut);
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) Instance(org.dkpro.tc.api.features.Instance) Gson(com.google.gson.Gson) FileInputStream(java.io.FileInputStream) BufferedWriter(java.io.BufferedWriter) FileOutputStream(java.io.FileOutputStream) BufferedReader(java.io.BufferedReader) ArrayList(java.util.ArrayList) List(java.util.List) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File)

Example 38 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class InstanceTest method instanceInitializationByListTest.

/**
 * An instance created from a list holding two features must report two features.
 */
@Test
public void instanceInitializationByListTest() throws Exception {
    List<Feature> featureList = new ArrayList<>();
    featureList.add(new Feature("feature1", "value1", FeatureType.STRING));
    featureList.add(new Feature("feature2", "value1", FeatureType.STRING));

    Instance created = new Instance(featureList, "outcome");

    int featureCount = created.getFeatures().size();
    assertEquals(2, featureCount);
}
Also used : Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) Feature(org.dkpro.tc.api.features.Feature) Test(org.junit.Test)

Example 39 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class InstanceTest method instanceInitializationBySetTest.

/**
 * An instance created from a set holding two features must report two features.
 */
@Test
public void instanceInitializationBySetTest() throws Exception {
    Set<Feature> featureSet = new HashSet<Feature>();
    featureSet.add(new Feature("feature1", "value1", FeatureType.STRING));
    featureSet.add(new Feature("feature2", "value1", FeatureType.STRING));

    Instance created = new Instance(featureSet, "outcome");

    int featureCount = created.getFeatures().size();
    assertEquals(2, featureCount);
}
Also used : Instance(org.dkpro.tc.api.features.Instance) Feature(org.dkpro.tc.api.features.Feature) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 40 with Instance

use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.

the class InstanceTest method instanceSetSeveralOutcomesTest.

/**
 * After replacing the outcomes of an instance with a list of two outcomes, the instance must
 * report two outcomes.
 */
@Test
public void instanceSetSeveralOutcomesTest() throws Exception {
    List<Feature> featureList = new ArrayList<>();
    featureList.add(new Feature("feature1", "value1", FeatureType.STRING));
    featureList.add(new Feature("feature2", "value1", FeatureType.STRING));
    Instance created = new Instance(featureList, "outcome");

    List<String> replacementOutcomes = new ArrayList<String>();
    replacementOutcomes.add("outcome1");
    replacementOutcomes.add("outcome2");
    created.setOutcomes(replacementOutcomes);

    int outcomeCount = created.getOutcomes().size();
    assertEquals(2, outcomeCount);
}
Also used : Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) Feature(org.dkpro.tc.api.features.Feature) Test(org.junit.Test)

Aggregations

Instance (org.dkpro.tc.api.features.Instance)61 ArrayList (java.util.ArrayList)38 Feature (org.dkpro.tc.api.features.Feature)30 Test (org.junit.Test)27 File (java.io.File)17 AnalysisEngineProcessException (org.apache.uima.analysis_engine.AnalysisEngineProcessException)14 IOException (java.io.IOException)12 Gson (com.google.gson.Gson)8 TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException)8 Attribute (weka.core.Attribute)8 DenseInstance (weka.core.DenseInstance)8 Instances (weka.core.Instances)8 SparseInstance (weka.core.SparseInstance)8 FeatureExtractorResource_ImplBase (org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase)6 BufferedReader (java.io.BufferedReader)5 FileInputStream (java.io.FileInputStream)5 InputStreamReader (java.io.InputStreamReader)5 AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription)5 CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription)5 ExternalResourceDescription (org.apache.uima.resource.ExternalResourceDescription)5