use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class WordNGramTest method luceneNGramFeatureExtractorNonDefaultFrequencyThresholdTest.
/**
 * Runs the word n-gram meta collector and feature extractor with a
 * non-default frequency threshold ("0.1f") and top-k of "3", then checks
 * the written instances: four instances, one unique outcome, and no
 * features on any instance.
 *
 * NOTE(review): the empty feature sets are presumably a consequence of the
 * frequency threshold filtering out every n-gram - confirm against WordNGram.
 */
@Test
public void luceneNGramFeatureExtractorNonDefaultFrequencyThresholdTest() throws Exception {
    File luceneFolder = folder.newFolder();
    File outputPath = folder.newFolder();
    String lucenePath = luceneFolder.toString();

    Object[] parameters = new Object[] {
            WordNGram.PARAM_NGRAM_USE_TOP_K, "3",
            WordNGram.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            WordNGram.PARAM_SOURCE_LOCATION, lucenePath,
            WordNGram.PARAM_NGRAM_FREQ_THRESHOLD, "0.1f",
            WordNGramMC.PARAM_TARGET_LOCATION, lucenePath };
    // The meta collector receives its own copy of the parameter array.
    Object[] metaCollectorParameters = Arrays.copyOf(parameters, parameters.length);

    CollectionReaderDescription reader = getMetaReader();
    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory
            .createEngineDescription(WordNGramMC.class, metaCollectorParameters);

    List<ExternalResourceDescription> extractors = new ArrayList<>();
    extractors.add(ExternalResourceFactory.createExternalResourceDescription(WordNGram.class, parameters));

    AnalysisEngineDescription featExtractorConnector = TaskUtils.getFeatureExtractorConnector(
            outputPath.getAbsolutePath(), JsonDataWriter.class.getName(), Constants.LM_SINGLE_LABEL,
            Constants.FM_DOCUMENT, false, false, false, false, Collections.emptyList(), extractors,
            new String[] {});

    // First pass: collect the meta information.
    SimplePipeline.runPipeline(reader, segmenter, metaCollector);
    // Second pass: extract the features and write them to outputPath.
    SimplePipeline.runPipeline(reader, segmenter, featExtractorConnector);

    List<Instance> written = readInstances(outputPath);
    assertEquals(4, written.size());
    assertEquals(1, getUniqueOutcomes(written));
    for (Instance instance : written) {
        assertTrue(instance.getFeatures().isEmpty());
    }
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class LuceneMetaCollectionBasedFeatureTestBase method readInstances.
/**
 * Reads the JSON file named {@code JsonDataWriter.JSON_FILE_NAME} from the
 * given directory and deserializes every line into an {@link Instance}.
 *
 * @param output directory containing the JSON file
 * @return one instance per line of the file, in file order
 * @throws IOException if the file cannot be read
 */
protected List<Instance> readInstances(File output) throws IOException {
    Gson parser = new Gson();
    File jsonFile = new File(output, JsonDataWriter.JSON_FILE_NAME);
    List<String> jsonLines = FileUtils.readLines(jsonFile, "utf-8");

    List<Instance> result = new ArrayList<>();
    for (int idx = 0; idx < jsonLines.size(); idx++) {
        Instance parsed = parser.fromJson(jsonLines.get(idx), Instance.class);
        result.add(parsed);
    }
    return result;
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class NgramUnitTest method evaluateExtractedFeatures.
/**
 * Reads the JSON file named {@code JsonDataWriter.JSON_FILE_NAME} from the
 * given directory, deserializes one {@link Instance} per line and verifies
 * the extracted features.
 *
 * Expectations checked: exactly 8 instances, each with exactly one feature
 * whose value is either 1.0 or 0.0; 2 instances carry 1.0 and 6 carry 0.0.
 *
 * @param output directory containing the JSON file
 * @throws Exception if the file cannot be read, or an
 *         {@link IllegalStateException} if a feature value is neither 1.0
 *         nor 0.0
 */
private void evaluateExtractedFeatures(File output) throws Exception {
    Gson gson = new Gson();
    List<String> lines = FileUtils.readLines(new File(output, JsonDataWriter.JSON_FILE_NAME), "utf-8");
    List<Instance> instances = new ArrayList<>();
    for (String line : lines) {
        instances.add(gson.fromJson(line, Instance.class));
    }
    assertEquals(8, instances.size());

    int numFeatValueOne = 0;
    int numFeatValuesZero = 0;
    for (Instance instance : instances) {
        List<Feature> features = new ArrayList<Feature>(instance.getFeatures());
        assertEquals(1, features.size());

        Object value = features.get(0).getValue();
        // Unboxing cast: fails with ClassCastException if the value is not a Double.
        double numeric = (double) value;
        if (numeric == 1.0) {
            numFeatValueOne++;
        } else if (numeric == 0.0) {
            numFeatValuesZero++;
        } else {
            throw new IllegalStateException("Value should either be 1.0 or 0.0 but was [" + value + "]");
        }
    }
    assertEquals(2, numFeatValueOne);
    assertEquals(6, numFeatValuesZero);
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class KeywordNGramTest method initialize.
/**
 * Runs the keyword n-gram meta collector and feature extractor over
 * {@code src/test/resources/ngrams/trees.txt} and returns the instances
 * that were written as JSON.
 *
 * @param includeComma value passed as
 *        {@code KeywordNGram.PARAM_KEYWORD_NGRAM_INCLUDE_COMMAS}
 * @param markSentenceLocation value passed as
 *        {@code KeywordNGram.PARAM_KEYWORD_NGRAM_MARK_SENTENCE_LOCATION}
 * @return the deserialized instances; the method asserts there is exactly one
 * @throws Exception if folder creation, a pipeline run or reading the output fails
 */
private List<Instance> initialize(boolean includeComma, boolean markSentenceLocation) throws Exception {
    File luceneFolder = folder.newFolder();
    File outputPath = folder.newFolder();

    Object[] parameters = new Object[] {
            KeywordNGram.PARAM_UNIQUE_EXTRACTOR_NAME, "123",
            KeywordNGram.PARAM_NGRAM_KEYWORDS_FILE, "src/test/resources/data/keywordlist.txt",
            KeywordNGram.PARAM_SOURCE_LOCATION, luceneFolder,
            KeywordNGramMC.PARAM_TARGET_LOCATION, luceneFolder,
            KeywordNGram.PARAM_KEYWORD_NGRAM_MARK_SENTENCE_LOCATION, markSentenceLocation,
            KeywordNGram.PARAM_KEYWORD_NGRAM_INCLUDE_COMMAS, includeComma };

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
            TestReaderSingleLabelDocumentReader.class,
            TestReaderSingleLabelDocumentReader.PARAM_SOURCE_LOCATION,
            "src/test/resources/ngrams/trees.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory
            .createEngineDescription(KeywordNGramMC.class, parameters);

    // The external resource is configured with the converted form of the same parameters.
    List<ExternalResourceDescription> extractors = new ArrayList<>();
    extractors.add(ExternalResourceFactory.createExternalResourceDescription(KeywordNGram.class,
            toString(parameters)));

    AnalysisEngineDescription featExtractorConnector = TaskUtils.getFeatureExtractorConnector(
            outputPath.getAbsolutePath(), JsonDataWriter.class.getName(), Constants.LM_SINGLE_LABEL,
            Constants.FM_DOCUMENT, false, false, false, false, Collections.emptyList(), extractors,
            new String[] {});

    // First pass: collect the meta information.
    SimplePipeline.runPipeline(reader, segmenter, metaCollector);
    // Second pass: extract the features and write them to outputPath.
    SimplePipeline.runPipeline(reader, segmenter, featExtractorConnector);

    File jsonFile = new File(outputPath, JsonDataWriter.JSON_FILE_NAME);
    List<String> jsonLines = FileUtils.readLines(jsonFile, "utf-8");
    Gson parser = new Gson();
    List<Instance> result = new ArrayList<>();
    for (int idx = 0; idx < jsonLines.size(); idx++) {
        result.add(parser.fromJson(jsonLines.get(idx), Instance.class));
    }
    assertEquals(1, result.size());
    return result;
}
use of org.dkpro.tc.api.features.Instance in project dkpro-tc by dkpro.
the class LibsvmDataFormatWriter method writeGenericFormat.
/**
 * Serializes all given instances as a single JSON array followed by a
 * newline, writes it to {@code bw}, and closes the writer.
 *
 * NOTE(review): {@code initGeneric()} presumably opens {@code bw} - confirm.
 *
 * @param instances the instances to write together as one JSON array
 * @throws AnalysisEngineProcessException wrapping any exception raised while
 *         initializing, serializing, writing or closing the writer
 */
@Override
public void writeGenericFormat(Collection<Instance> instances) throws AnalysisEngineProcessException {
    try {
        initGeneric();
        // bulk-write - in sequence mode this keeps the instances together
        // that belong to the same sequence!
        Instance[] array = instances.toArray(new Instance[0]);
        bw.write(gson.toJson(array) + "\n");
        // Close explicitly on the success path: if the writer buffers, the data
        // may only reach the target on close, and closeQuietly() below would
        // discard a failure at that point, silently truncating the output.
        bw.close();
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    } finally {
        // Error path (or repeated close after success): release the writer
        // without masking the exception already being propagated.
        IOUtils.closeQuietly(bw);
        bw = null;
    }
}
Aggregations