use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class ExtractFeaturesConnector method getFeatureNames.
/**
 * Performs a single feature-extraction pass over the given CAS so that the
 * names of all features become known, then persists the collected metadata.
 *
 * @param jcas the CAS to extract instances from
 * @throws AnalysisEngineProcessException wrapping any exception raised during
 *         extraction, metadata collection, or writing
 */
private void getFeatureNames(JCas jcas) throws AnalysisEngineProcessException {
    try {
        // 'false' is passed as the second argument of getInstances
        // (presumably the sparse-extraction flag -- confirm against InstanceExtractor).
        featureMeta.collectMetaData(instanceExtractor.getInstances(jcas, false));
        featureMeta.writeMetaData(outputDirectory);
    } catch (Exception cause) {
        throw new AnalysisEngineProcessException(cause);
    }
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class ExtractFeaturesConnector method enforceMatchingFeatures.
/**
 * Restricts the features of each instance to those whose names are listed in
 * {@code featureMeta.getFeatureNames()}.
 *
 * <p>When {@code isTesting} is false the input list is returned untouched.
 * Otherwise every instance is modified in place (its feature list is replaced
 * by the filtered one) and the instances are returned in a new list in their
 * original order.
 *
 * @param instances the instances to filter
 * @return the input list itself when not testing, otherwise a new list holding
 *         the same (now filtered) instances
 */
private List<Instance> enforceMatchingFeatures(List<Instance> instances) {
    if (!isTesting) {
        return instances;
    }

    List<Instance> filtered = new ArrayList<>();
    for (Instance instance : instances) {
        List<Feature> knownFeatures = new ArrayList<>();
        for (Feature candidate : instance.getFeatures()) {
            // keep only features whose name is present in the recorded metadata
            if (featureMeta.getFeatureNames().contains(candidate.getName())) {
                knownFeatures.add(candidate);
            }
        }
        instance.setFeatures(knownFeatures);
        filtered.add(instance);
    }
    return filtered;
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class InstanceExtractor method getSequenceInstances.
/**
 * Builds one {@link Instance} per {@code TextClassificationTarget} covered by
 * each {@code TextClassificationSequence} in the CAS.
 *
 * <p>Sequences are numbered from 0 in iteration order; within each sequence the
 * targets are numbered from 0 and that number is written to the target via
 * {@code setId}. Each instance receives the features of all configured
 * extractors (sparse or dense), its outcomes, weight, the CAS id, the sequence
 * id and its position in the sequence. If {@code addInstanceId} is set, an
 * instance-id feature is added first.
 *
 * @param jcas      the CAS to read sequences and targets from
 * @param useSparse whether to use sparse ({@code true}) or dense ({@code false})
 *                  feature extraction
 * @return the extracted instances, in sequence order then target order
 * @throws TextClassificationException declared by this method; which of the
 *         called helpers raise it is not visible in this block
 */
public List<Instance> getSequenceInstances(JCas jcas, boolean useSparse) throws TextClassificationException {
    List<Instance> result = new ArrayList<>();
    int jcasId = JCasUtil.selectSingle(jcas, JCasId.class).getId();

    int sequenceId = 0;
    for (TextClassificationSequence sequence : JCasUtil.select(jcas, TextClassificationSequence.class)) {
        // position counter restarts for every sequence
        int position = 0;
        List<TextClassificationTarget> targets = JCasUtil.selectCovered(jcas, TextClassificationTarget.class,
                sequence);
        for (TextClassificationTarget target : targets) {
            // the id must be set before the instance-id feature is retrieved for this target
            target.setId(position);
            position++;

            Instance current = new Instance();
            if (addInstanceId) {
                current.addFeature(InstanceIdFeature.retrieve(jcas, target, sequenceId));
            }

            for (FeatureExtractorResource_ImplBase extractor : featureExtractors) {
                if (useSparse) {
                    current.addFeatures(getSparse(jcas, target, extractor));
                } else {
                    current.addFeatures(getDense(jcas, target, extractor));
                }
            }

            // outcome label(s) and weight, followed by the bookkeeping ids
            current.setOutcomes(getOutcomes(jcas, target));
            current.setWeight(getWeight(jcas, target));
            current.setJcasId(jcasId);
            current.setSequenceId(sequenceId);
            current.setSequencePosition(target.getId());

            result.add(current);
        }
        sequenceId++;
    }
    return result;
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class InstanceExtractor method getInstances.
/**
 * Extracts instances from the CAS according to the configured mode.
 *
 * <p>In sequence mode all sequence instances are returned, in unit mode all
 * unit instances; otherwise exactly one instance is extracted.
 *
 * @param aJCas         the CAS to extract from
 * @param extractSparse forwarded unchanged to the mode-specific extraction method
 * @return a new list containing the extracted instances
 * @throws AnalysisEngineProcessException wrapping any exception raised while
 *         checking the mode or extracting
 */
public List<Instance> getInstances(JCas aJCas, boolean extractSparse) throws AnalysisEngineProcessException {
    List<Instance> collected = new ArrayList<>();
    try {
        if (isSequenceMode()) {
            collected.addAll(getSequenceInstances(aJCas, extractSparse));
        } else if (isUnitMode()) {
            collected.addAll(getUnitInstances(aJCas, extractSparse));
        } else {
            collected.add(getSingleInstance(aJCas, extractSparse));
        }
    } catch (Exception cause) {
        throw new AnalysisEngineProcessException(cause);
    }
    return collected;
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class TestTaskUtils method testInstanceMultiplicationWithUnitId.
/**
 * Extracts instances in sequence mode with no feature extractors and with the
 * instance-id feature enabled, then checks for each of the six resulting
 * instances: the value of its first feature (the instance id), its sequence
 * id, its position within the sequence, and its outcome.
 */
@Test
public void testInstanceMultiplicationWithUnitId() throws Exception {
    JCas jCas = initJCas(true);
    FeatureExtractorResource_ImplBase[] featureExtractors = {};
    InstanceExtractor ie = new InstanceExtractor(Constants.FM_SEQUENCE, featureExtractors, true);

    List<Instance> multipleInstances = ie.getInstances(jCas, false);
    assertEquals(6, multipleInstances.size());

    // Expected values per instance, in extraction order.
    String[] expectedIds = { "4711_0_0_a", "4711_0_1_car", "4711_0_2_drives", "4711_1_0_the",
            "4711_1_1_hedgehogs", "4711_1_2_dies" };
    int[] expectedSequenceIds = { 0, 0, 0, 1, 1, 1 };
    int[] expectedPositions = { 0, 1, 2, 0, 1, 2 };
    String[] expectedOutcomes = { "DT", "NN", "VBZ", "DT", "NN", "VBZ" };

    for (int idx = 0; idx < expectedIds.length; idx++) {
        assertEquals(expectedIds[idx], multipleInstances.get(idx).getFeatures().iterator().next().getValue());
        assertEquals(expectedSequenceIds[idx], multipleInstances.get(idx).getSequenceId());
        assertEquals(expectedPositions[idx], multipleInstances.get(idx).getSequencePosition());
        assertEquals(expectedOutcomes[idx], multipleInstances.get(idx).getOutcome());
    }
}
Aggregations