use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class ExtractFeaturesConnectorTest method extractFeaturesConnectorMultiLabelTest.
/**
 * Runs reader, segmenter, document-mode annotator and the feature extractor connector over the
 * test documents, then reads the JSON lines written by {@link JsonDataWriter} back into
 * {@link Instance} objects and checks the instance and unique-outcome counts.
 */
@Test
public void extractFeaturesConnectorMultiLabelTest() throws Exception {
    File targetDir = folder.newFolder();

    // we do not need parameters here, but in case we do :)
    Object[] parameters = new Object[] { NoopFeatureExtractor.PARAM_UNIQUE_EXTRACTOR_NAME, "123" };
    List<ExternalResourceDescription> fes = new ArrayList<>();
    fes.add(ExternalResourceFactory.createExternalResourceDescription(NoopFeatureExtractor.class, parameters));

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
            TestReaderMultiLabel.class,
            TestReaderMultiLabel.PARAM_SOURCE_LOCATION, "src/test/resources/data/*.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription docModeAnnotator = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);

    // NOTE(review): the learning mode passed here is LM_REGRESSION although the test name says
    // "MultiLabel" - confirm this is intended.
    AnalysisEngineDescription connector = TaskUtils.getFeatureExtractorConnector(
            targetDir.getAbsolutePath(), JsonDataWriter.class.getName(),
            Constants.LM_REGRESSION, Constants.FM_DOCUMENT,
            false, false, false, false,
            Collections.emptyList(), fes, new String[] {});

    SimplePipeline.runPipeline(reader, segmenter, docModeAnnotator, connector);

    // each line of the output file is one JSON-serialised instance
    File jsonFile = new File(targetDir, JsonDataWriter.JSON_FILE_NAME);
    List<String> jsonLines = FileUtils.readLines(jsonFile, "utf-8");
    Gson gson = new Gson();
    List<Instance> instances = new ArrayList<>();
    jsonLines.forEach(jsonLine -> instances.add(gson.fromJson(jsonLine, Instance.class)));

    assertEquals(2, instances.size());
    assertEquals(3, getUniqueOutcomes(instances));
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class InstanceExtractor method getUnitInstances.
/**
 * Creates one {@link Instance} for every {@link TextClassificationTarget} in the given CAS.
 * Each instance receives the features of all configured extractors, its outcome(s), its
 * weight, the id of the CAS and the id of the target as sequence position.
 *
 * @param jcas
 *            the CAS to read targets from
 * @param supportSparseFeatures
 *            if true features are collected via {@code getSparse}, otherwise via {@code getDense}
 * @return the instances, in iteration order of the selected targets
 * @throws TextClassificationException
 *             if a configured extractor does not implement {@link FeatureExtractor}
 */
public List<Instance> getUnitInstances(JCas jcas, boolean supportSparseFeatures) throws TextClassificationException {
    List<Instance> result = new ArrayList<Instance>();
    int casId = JCasUtil.selectSingle(jcas, JCasId.class).getId();

    for (TextClassificationTarget unit : JCasUtil.select(jcas, TextClassificationTarget.class)) {
        Instance current = new Instance();

        // optional id feature goes in first, before any extractor output
        if (addInstanceId) {
            current.addFeature(InstanceIdFeature.retrieve(jcas, unit));
        }

        for (FeatureExtractorResource_ImplBase extractor : featureExtractors) {
            boolean isUsable = extractor instanceof FeatureExtractor;
            if (!isUsable) {
                throw new TextClassificationException("Feature extractor does not implement interface [" + FeatureExtractor.class.getName() + "]: " + extractor.getResourceName());
            }

            if (supportSparseFeatures) {
                current.addFeatures(getSparse(jcas, unit, extractor));
            } else {
                current.addFeatures(getDense(jcas, unit, extractor));
            }
        }

        // outcome label(s), weight and bookkeeping ids; the sequence id is not set here
        current.setOutcomes(getOutcomes(jcas, unit));
        current.setWeight(getWeight(jcas, unit));
        current.setJcasId(casId);
        current.setSequencePosition(unit.getId());

        result.add(current);
    }

    return result;
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class JsonDataWriter method writeClassifierFormat.
/**
 * Serialises every instance to JSON and writes it as one line to the writer set up by
 * {@code init()}. The writer is closed and reset to {@code null} afterwards, also when
 * serialisation or writing fails, so that a failure does not leave an open writer behind.
 *
 * @param instances
 *            the instances to write
 * @throws AnalysisEngineProcessException
 *             wrapping any exception raised while initialising, writing or closing
 */
@Override
public void writeClassifierFormat(Collection<Instance> instances) throws AnalysisEngineProcessException {
    try {
        init();
        try {
            for (Instance instance : instances) {
                bw.write(gson.toJson(instance) + "\n");
            }
        } finally {
            // always release the writer; previously a failed write skipped close() and leaked it
            if (bw != null) {
                try {
                    bw.close();
                } finally {
                    bw = null;
                }
            }
        }
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class TestTaskUtils method testInstanceMultiplicationWithoutUnitId.
/**
 * Extracts instances in sequence mode without any feature extractor and verifies that six
 * instances come back: two sequences (ids 0 and 1) of three positions each, carrying the
 * outcomes DT, NN, VBZ and an id feature value of the form {@code 4711_<sequence>_<position>}.
 */
@Test
public void testInstanceMultiplicationWithoutUnitId() throws Exception {
    JCas jCas = initJCas(false);
    FeatureExtractorResource_ImplBase[] noExtractors = {};
    InstanceExtractor extractor = new InstanceExtractor(Constants.FM_SEQUENCE, noExtractors, true);
    List<Instance> multipleInstances = extractor.getInstances(jCas, false);
    assertEquals(6, multipleInstances.size());

    // every sequence is expected to carry the same three outcomes in this order
    String[] expectedOutcomes = { "DT", "NN", "VBZ" };
    int sequenceCount = 2;

    int idx = 0;
    for (int sequence = 0; sequence < sequenceCount; sequence++) {
        for (int position = 0; position < expectedOutcomes.length; position++) {
            String expectedId = "4711_" + sequence + "_" + position;
            assertEquals(expectedId, multipleInstances.get(idx).getFeatures().iterator().next().getValue());
            assertEquals(sequence, multipleInstances.get(idx).getSequenceId());
            assertEquals(position, multipleInstances.get(idx).getSequencePosition());
            assertEquals(expectedOutcomes[position], multipleInstances.get(idx).getOutcome());
            idx++;
        }
    }
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class WekaUtils method instanceListToArffFile.
/**
 * Writes a list of instances incrementally to an ARFF file. Single-label case.
 * <p>
 * The outcome of every instance becomes the class attribute. If the attribute store built from
 * the instances already contains an attribute with the class attribute name, the outcome
 * attribute is renamed with the class attribute prefix.
 *
 * @param outputFile
 *            the output file; missing parent directories are created
 * @param instanceList
 *            the instance list
 * @param useDenseInstances
 *            if true dense Weka instances are written, otherwise sparse ones
 * @param isRegressionExperiment
 *            if true outcomes are parsed as doubles and set as numeric class value
 * @param useWeights
 *            if true the weight of each instance is transferred to the Weka instance
 * @throws IllegalArgumentException
 *             if the instance list yields no outcomes
 * @throws Exception
 *             in case of error
 */
public static void instanceListToArffFile(File outputFile, List<Instance> instanceList, boolean useDenseInstances, boolean isRegressionExperiment, boolean useWeights) throws Exception {
    List<String> outcomeList = new ArrayList<>();
    for (Instance i : instanceList) {
        outcomeList.add(i.getOutcome());
    }
    // check for error conditions
    if (outcomeList.isEmpty()) {
        throw new IllegalArgumentException("List of instance outcomes is empty.");
    }

    AttributeStore attributeStore = WekaFeatureEncoder.getAttributeStore(instanceList);

    // Make sure the outcome attribute does not clash with the name of a feature attribute
    Attribute outcomeAttribute = createOutcomeAttribute(outcomeList, isRegressionExperiment);
    if (attributeStore.containsAttributeName(CLASS_ATTRIBUTE_NAME)) {
        System.err.println("A feature with name \"outcome\" was found. Renaming outcome attribute");
        outcomeAttribute = outcomeAttribute.copy(CLASS_ATTRIBUTE_PREFIX + CLASS_ATTRIBUTE_NAME);
    }
    attributeStore.addAttribute(outcomeAttribute.name(), outcomeAttribute);

    Instances wekaInstances = new Instances(RELATION_NAME, attributeStore.getAttributes(), instanceList.size());
    wekaInstances.setClass(outcomeAttribute);

    if (!outputFile.exists()) {
        // create the parent directories only; calling mkdirs() on the file itself would
        // create a directory at the location where the ARFF file is supposed to live
        File parentDir = outputFile.getAbsoluteFile().getParentFile();
        if (parentDir != null) {
            parentDir.mkdirs();
        }
        outputFile.createNewFile();
    }

    ArffSaver saver = new ArffSaver();
    saver.setRetrieval(Saver.INCREMENTAL);
    saver.setFile(outputFile);
    saver.setCompressOutput(true);
    saver.setInstances(wekaInstances);

    for (int i = 0; i < instanceList.size(); i++) {
        Instance instance = instanceList.get(i);
        double[] featureValues = getFeatureValues(attributeStore, instance);

        weka.core.Instance wekaInstance;
        if (useDenseInstances) {
            wekaInstance = new DenseInstance(1.0, featureValues);
        } else {
            wekaInstance = new SparseInstance(1.0, featureValues);
        }
        wekaInstance.setDataset(wekaInstances);

        String outcome = outcomeList.get(i);
        if (isRegressionExperiment) {
            wekaInstance.setClassValue(Double.parseDouble(outcome));
        } else {
            wekaInstance.setClassValue(outcome);
        }

        // the weight is only read when it is actually used
        if (useWeights) {
            wekaInstance.setWeight(instance.getWeight());
        }

        saver.writeIncremental(wekaInstance);
    }

    // finishes the incremental saving process
    saver.writeIncremental(null);
}
Aggregations