use of org.dkpro.tc.core.task.uima.InstanceExtractor in project dkpro-tc by dkpro.
the class TestTaskUtils method testInstanceMultiplicationWithoutUnitId.
/**
 * Sequence mode without unit ids: expects six instances (two sequences of three units),
 * each carrying an id feature value of the form {@code <docId>_<sequenceId>_<position>}.
 */
@Test
public void testInstanceMultiplicationWithoutUnitId() throws Exception {
    JCas cas = initJCas(false);
    FeatureExtractorResource_ImplBase[] noExtractors = {};
    InstanceExtractor instanceExtractor = new InstanceExtractor(Constants.FM_SEQUENCE, noExtractors, true);
    List<Instance> extracted = instanceExtractor.getInstances(cas, false);

    // Expected values per instance, in extraction order (sequence 1 first, then sequence 2).
    String[] expectedIds = { "4711_0_0", "4711_0_1", "4711_0_2", "4711_1_0", "4711_1_1", "4711_1_2" };
    int[] expectedSequenceIds = { 0, 0, 0, 1, 1, 1 };
    int[] expectedPositions = { 0, 1, 2, 0, 1, 2 };
    String[] expectedOutcomes = { "DT", "NN", "VBZ", "DT", "NN", "VBZ" };

    assertEquals(6, extracted.size());
    for (int i = 0; i < expectedIds.length; i++) {
        Instance current = extracted.get(i);
        // The first feature of every instance holds the generated instance id.
        assertEquals(expectedIds[i], current.getFeatures().iterator().next().getValue());
        assertEquals(expectedSequenceIds[i], current.getSequenceId());
        assertEquals(expectedPositions[i], current.getSequencePosition());
        assertEquals(expectedOutcomes[i], current.getOutcome());
    }
}
use of org.dkpro.tc.core.task.uima.InstanceExtractor in project dkpro-tc by dkpro.
the class WekaLoadModelConnector method process.
/**
 * Classifies the first instance extracted from the CAS with the loaded classifier and
 * stores the prediction in {@link TextClassificationOutcome} annotation(s).
 * <p>
 * Single-label and regression: the prediction is written into the outcome annotation
 * returned by {@code getOutcome(jcas)}. Multi-label: the existing outcome annotation(s)
 * are removed from the indexes and one new outcome annotation is added for every label
 * whose predicted value is greater than or equal to the bipartition threshold.
 *
 * @param jcas
 *            the CAS to classify
 * @throws AnalysisEngineProcessException
 *             if instance extraction, instance conversion or classification fails
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    Instance instance;
    try {
        InstanceExtractor extractor = new InstanceExtractor(featureMode, featureExtractors, false);
        List<Instance> instances = extractor.getInstances(jcas, useSparse);
        // Only the first extracted instance is classified.
        instance = instances.get(0);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    boolean isMultiLabel = learningMode.equals(Constants.LM_MULTI_LABEL);
    boolean isRegression = learningMode.equals(Constants.LM_REGRESSION);

    if (!isMultiLabel) {
        // single-label / regression
        Object val;
        try {
            weka.core.Instance wekaInstance = WekaUtils.tcInstanceToWekaInstance(instance, trainingData,
                    classLabels, isRegression);
            if (!isRegression) {
                // The classifier result is used as an index into the class labels.
                val = classLabels.get((int) cls.classifyInstance(wekaInstance));
            } else {
                val = cls.classifyInstance(wekaInstance);
            }
        } catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        }
        TextClassificationOutcome outcome = getOutcome(jcas);
        outcome.setOutcome(val.toString());
        return;
    }

    // multi-label
    weka.core.Instance mekaInstance;
    double[] vals;
    try {
        mekaInstance = WekaUtils.tcInstanceToMekaInstance(instance, trainingData, classLabels);
        vals = cls.distributionForInstance(mekaInstance);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    // Loop-invariant: convert the threshold once instead of once per label.
    double threshold = Double.valueOf(bipartitionThreshold);
    List<String> outcomes = new ArrayList<String>();
    for (int i = 0; i < vals.length; i++) {
        if (vals[i] >= threshold) {
            // The label is the part of the attribute name that follows the class-attribute prefix.
            String label = mekaInstance.attribute(i).name().split(WekaDataWriter.CLASS_ATTRIBUTE_PREFIX)[1];
            outcomes.add(label);
        }
    }

    // Drop the outcome annotation(s) currently in the CAS before adding the predictions.
    if (FM_DOCUMENT.equals(featureMode) || FM_PAIR.equals(featureMode)) {
        Collection<TextClassificationOutcome> oldOutcomes = JCasUtil.select(jcas, TextClassificationOutcome.class);
        // Snapshot first, then remove - presumably so the index is not modified while it is iterated.
        List<Annotation> annotationsList = new ArrayList<Annotation>();
        for (TextClassificationOutcome oldOutcome : oldOutcomes) {
            annotationsList.add(oldOutcome);
        }
        for (Annotation annotation : annotationsList) {
            annotation.removeFromIndexes();
        }
    } else {
        getOutcome(jcas).removeFromIndexes();
    }

    // One outcome annotation per predicted label; every addition shares the same error handling.
    try {
        for (String label : outcomes) {
            TextClassificationOutcome newOutcome = new TextClassificationOutcome(jcas);
            newOutcome.setOutcome(label);
            newOutcome.addToIndexes();
        }
    } catch (Exception ex) {
        String msg = "Error while adding outcome annotations to the CAS. Details: " + ex.getMessage();
        Logger.getLogger(getClass()).error(msg, ex);
        throw new RuntimeException(msg, ex);
    }
}
use of org.dkpro.tc.core.task.uima.InstanceExtractor in project dkpro-tc by dkpro.
the class TestTaskUtils method testInstanceMultiplicationWithUnitId.
/**
 * Sequence mode with unit ids: expects six instances (two sequences of three units),
 * each carrying an id feature value of the form
 * {@code <docId>_<sequenceId>_<position>_<unitId>}.
 */
@Test
public void testInstanceMultiplicationWithUnitId() throws Exception {
    JCas cas = initJCas(true);
    FeatureExtractorResource_ImplBase[] noExtractors = {};
    InstanceExtractor instanceExtractor = new InstanceExtractor(Constants.FM_SEQUENCE, noExtractors, true);
    List<Instance> extracted = instanceExtractor.getInstances(cas, false);

    // Expected values per instance, in extraction order.
    String[] expectedIds = { "4711_0_0_a", "4711_0_1_car", "4711_0_2_drives", "4711_1_0_the",
            "4711_1_1_hedgehogs", "4711_1_2_dies" };
    int[] expectedSequenceIds = { 0, 0, 0, 1, 1, 1 };
    int[] expectedPositions = { 0, 1, 2, 0, 1, 2 };
    String[] expectedOutcomes = { "DT", "NN", "VBZ", "DT", "NN", "VBZ" };

    assertEquals(6, extracted.size());
    for (int i = 0; i < expectedIds.length; i++) {
        Instance current = extracted.get(i);
        // The first feature of every instance holds the generated instance id.
        assertEquals(expectedIds[i], current.getFeatures().iterator().next().getValue());
        assertEquals(expectedSequenceIds[i], current.getSequenceId());
        assertEquals(expectedPositions[i], current.getSequencePosition());
        assertEquals(expectedOutcomes[i], current.getOutcome());
    }
}
use of org.dkpro.tc.core.task.uima.InstanceExtractor in project dkpro-tc by dkpro.
the class TestTaskUtils method testUnitModeInstanceNumbering.
/**
 * Tests the numbering in unit mode, i.e. without sequences: expects six instances that all
 * report sequence id 0 and a position that counts up across the whole document, with id
 * feature values of the form {@code <docId>_<position>_<unitId>}.
 */
@Test
public void testUnitModeInstanceNumbering() throws Exception {
    JCas cas = initJCas(true);
    FeatureExtractorResource_ImplBase[] noExtractors = {};
    InstanceExtractor instanceExtractor = new InstanceExtractor(Constants.FM_UNIT, noExtractors, true);
    List<Instance> extracted = instanceExtractor.getInstances(cas, false);

    // Expected values per instance, in extraction order.
    String[] expectedIds = { "4711_0_a", "4711_1_car", "4711_2_drives", "4711_3_the", "4711_4_hedgehogs",
            "4711_5_dies" };
    String[] expectedOutcomes = { "DT", "NN", "VBZ", "DT", "NN", "VBZ" };

    assertEquals(6, extracted.size());
    for (int position = 0; position < expectedIds.length; position++) {
        Instance current = extracted.get(position);
        // The first feature of every instance holds the generated instance id.
        assertEquals(expectedIds[position], current.getFeatures().iterator().next().getValue());
        assertEquals(0, current.getSequenceId());
        // In unit mode the expected position is simply the index in the result list.
        assertEquals(position, current.getSequencePosition());
        assertEquals(expectedOutcomes[position], current.getOutcome());
    }
}
use of org.dkpro.tc.core.task.uima.InstanceExtractor in project dkpro-tc by dkpro.
the class LibsvmDataFormatLoadModelConnector method createInputFile.
/**
 * Writes the instances extracted from the CAS to a temporary text file, one line per instance.
 * <p>
 * Each line consists of the outcome placeholder, the value returned by
 * {@code injectSequenceId(instance)} and, for every feature accepted by
 * {@code sanityCheckValue}, a tab followed by {@code <mapped feature name>:<feature value>}.
 * The file is registered for deletion on JVM exit.
 *
 * @param jcas
 *            the CAS to extract the instances from
 * @return the temporary file that was written
 * @throws Exception
 *             if instance extraction or writing the file fails
 */
private File createInputFile(JCas jcas) throws Exception {
    File tempFile = FileUtil.createTempFile("libsvm", ".txt");
    tempFile.deleteOnExit();

    // Extract before opening the file so a failing extraction cannot leave a stream open.
    InstanceExtractor extractor = new InstanceExtractor(featureMode, featureExtractors, true);
    List<Instance> instances = extractor.getInstances(jcas, true);

    // try-with-resources closes (and flushes) the writer even if a write throws.
    try (BufferedWriter bw = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(tempFile), "utf-8"))) {
        for (Instance instance : instances) {
            bw.write(OUTCOME_PLACEHOLDER);
            bw.write(injectSequenceId(instance));
            for (Feature f : instance.getFeatures()) {
                if (!sanityCheckValue(f)) {
                    continue;
                }
                bw.write("\t");
                bw.write(featureMapping.get(f.getName()) + ":" + f.getValue());
            }
            bw.write("\n");
        }
    }
    return tempFile;
}
Aggregations