use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class TestTaskUtils method testUnitModeInstanceNumbering.
// Checks how instances are numbered in unit mode, i.e. when no sequence is involved:
// every instance is expected to carry sequence id 0 and a position equal to its index.
@Test
public void testUnitModeInstanceNumbering() throws Exception {
    // Expected first feature value and outcome of each instance, in extraction order.
    String[] expectedFeatureValues = { "4711_0_a", "4711_1_car", "4711_2_drives", "4711_3_the",
            "4711_4_hedgehogs", "4711_5_dies" };
    String[] expectedOutcomes = { "DT", "NN", "VBZ", "DT", "NN", "VBZ" };

    JCas jCas = initJCas(true);
    FeatureExtractorResource_ImplBase[] featureExtractors = {};
    InstanceExtractor extractor = new InstanceExtractor(Constants.FM_UNIT, featureExtractors,
            true);
    List<Instance> extracted = extractor.getInstances(jCas, false);

    assertEquals(6, extracted.size());

    for (int position = 0; position < 6; position++) {
        Instance current = extracted.get(position);
        assertEquals(expectedFeatureValues[position],
                current.getFeatures().iterator().next().getValue());
        assertEquals(0, current.getSequenceId());
        assertEquals(position, current.getSequencePosition());
        assertEquals(expectedOutcomes[position], current.getOutcome());
    }
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class UniformClassDistributionFilter method applyFilter.
/**
 * Rewrites {@code f} in place so that every outcome is represented by the same number of
 * instances, namely the size of the least frequent outcome.
 * <p>
 * Each line of {@code f} is parsed as a JSON array of {@link Instance}. Instances are numbered
 * consecutively in reading order; for every outcome a random subset of ids (of the minimal
 * class size) is kept. The file is then read a second time, the instances are numbered in
 * exactly the same way, and only the kept ones are written to a temporary file which finally
 * replaces {@code f}.
 *
 * @param f
 *            the JSON-lines file to filter; it is overwritten with the filtered content
 * @throws Exception
 *             if reading, parsing or writing fails; failures while flushing/closing the
 *             output are propagated so a truncated result never silently replaces {@code f}
 */
@Override
public void applyFilter(File f) throws Exception {
    Gson gson = new Gson();

    Map<String, List<Integer>> idsByOutcome = collectInstanceIdsByOutcome(f, gson);
    java.util.Set<Integer> keptIds = selectBalancedIds(idsByOutcome);

    File tmpOut = new File(f.getParentFile(), "json_filtered.txt");
    try {
        writeKeptInstances(f, tmpOut, gson, keptIds);
        FileUtils.copyFile(tmpOut, f);
    } finally {
        // Remove the temporary file even when filtering or copying failed.
        FileUtils.deleteQuietly(tmpOut);
    }
}

/** Assigns a running id to every instance in {@code f} and groups the ids by outcome. */
private static Map<String, List<Integer>> collectInstanceIdsByOutcome(File f, Gson gson)
        throws Exception {
    Map<String, List<Integer>> idsByOutcome = new HashMap<>();
    int instanceId = 0;
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(f), "utf-8"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            Instance[] instances = gson.fromJson(line, Instance[].class);
            if (instances == null) {
                // Gson yields null for an empty line; such a line carries no instances.
                continue;
            }
            for (Instance instance : instances) {
                List<Integer> ids = idsByOutcome.get(instance.getOutcome());
                if (ids == null) {
                    ids = new ArrayList<>();
                    idsByOutcome.put(instance.getOutcome(), ids);
                }
                ids.add(instanceId++);
            }
        }
    }
    return idsByOutcome;
}

/** Randomly picks, for every outcome, as many instance ids as the smallest outcome has. */
private static java.util.Set<Integer> selectBalancedIds(
        Map<String, List<Integer>> idsByOutcome) {
    int minClassSize = Integer.MAX_VALUE;
    for (List<Integer> ids : idsByOutcome.values()) {
        minClassSize = Math.min(minClassSize, ids.size());
    }

    // A hash set gives constant-time membership checks while writing.
    java.util.Set<Integer> keptIds = new java.util.HashSet<>();
    for (List<Integer> ids : idsByOutcome.values()) {
        Collections.shuffle(ids);
        // For the smallest outcome this keeps all of its ids.
        keptIds.addAll(ids.subList(0, minClassSize));
    }
    return keptIds;
}

/**
 * Copies the instances whose running id is contained in {@code keptIds} from {@code source}
 * to {@code target}, preserving the one-JSON-array-per-line layout.
 */
private static void writeKeptInstances(File source, File target, Gson gson,
        java.util.Set<Integer> keptIds) throws Exception {
    // Must count instances exactly like collectInstanceIdsByOutcome, otherwise the ids
    // looked up here would refer to different instances than the ones selected.
    int instanceId = 0;
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(source), "utf-8"));
            BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(target), "utf-8"))) {
        String line;
        while ((line = reader.readLine()) != null) {
            Instance[] instances = gson.fromJson(line, Instance[].class);
            if (instances == null) {
                continue;
            }

            List<Instance> kept = new ArrayList<>();
            for (Instance instance : instances) {
                if (keptIds.contains(instanceId++)) {
                    kept.add(instance);
                }
            }

            if (kept.isEmpty()) {
                continue;
            }
            if (kept.size() == instances.length) {
                // Nothing was dropped: emit the original text unchanged.
                writer.write(line + "\n");
            }
            else {
                // Only part of the line survives: serialize the remaining instances so the
                // line is written once and without the filtered-out instances.
                writer.write(gson.toJson(kept.toArray(new Instance[0])) + "\n");
            }
        }
    }
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class InstanceTest method instanceInitializationByListTest.
// An Instance built from a list of two features must expose both of them.
@Test
public void instanceInitializationByListTest() throws Exception {
    List<Feature> featureList = new ArrayList<>();
    featureList.add(new Feature("feature1", "value1", FeatureType.STRING));
    featureList.add(new Feature("feature2", "value1", FeatureType.STRING));

    Instance underTest = new Instance(featureList, "outcome");

    int featureCount = underTest.getFeatures().size();
    assertEquals(2, featureCount);
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class InstanceTest method instanceInitializationBySetTest.
// An Instance built from a set of two features must expose both of them.
@Test
public void instanceInitializationBySetTest() throws Exception {
    Set<Feature> featureSet = new HashSet<Feature>();
    featureSet.add(new Feature("feature1", "value1", FeatureType.STRING));
    featureSet.add(new Feature("feature2", "value1", FeatureType.STRING));

    Instance underTest = new Instance(featureSet, "outcome");

    int featureCount = underTest.getFeatures().size();
    assertEquals(2, featureCount);
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class InstanceTest method instanceSetSeveralOutcomesTest.
// Setting a list of two outcomes on an Instance must make both retrievable afterwards.
@Test
public void instanceSetSeveralOutcomesTest() throws Exception {
    List<Feature> featureList = new ArrayList<>();
    featureList.add(new Feature("feature1", "value1", FeatureType.STRING));
    featureList.add(new Feature("feature2", "value1", FeatureType.STRING));
    Instance underTest = new Instance(featureList, "outcome");

    List<String> replacementOutcomes = new ArrayList<String>();
    replacementOutcomes.add("outcome1");
    replacementOutcomes.add("outcome2");
    underTest.setOutcomes(replacementOutcomes);

    int outcomeCount = underTest.getOutcomes().size();
    assertEquals(2, outcomeCount);
}
Aggregations