Use of org.apache.uima.fit.factory.AggregateBuilder in the dkpro-tc project by dkpro.
Class PPipelineTestBase, method runPipeline.
/**
 * Runs the pair test pipeline and loads the resulting instances.
 *
 * <p>The same reader and preprocessing aggregate are executed twice: once followed by
 * {@code metaCollector} and once followed by {@code featExtractorConnector}. Afterwards the
 * JSON file written below {@code outputPath} is parsed line by line into {@code instanceList},
 * and {@code featureNames} and {@code outcomeList} are filled from those instances.
 *
 * @throws Exception if building or running the pipeline, or reading the JSON output, fails
 */
protected void runPipeline() throws Exception {
    List<Object> parameterList = new ArrayList<Object>(Arrays.asList(parameters));

    CollectionReaderDescription pairReader = CollectionReaderFactory.createReaderDescription(
            TestPairReader.class, TestPairReader.PARAM_INPUT_FILE, setTestPairsLocation());

    // Preprocessing engines, listed in the order in which they are registered with the builder.
    AnalysisEngineDescription[] preprocessing = {
            AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
            AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class,
                    DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR),
            AnalysisEngineFactory.createEngineDescription(SnowballStemmer.class),
            AnalysisEngineFactory.createEngineDescription(MorphaLemmatizer.class),
            AnalysisEngineFactory.createEngineDescription(OpenNlpPosTagger.class)
    };

    // Every engine is registered twice, once per part of the pair.
    AggregateBuilder aggregate = new AggregateBuilder();
    for (AnalysisEngineDescription engine : preprocessing) {
        aggregate.add(engine, Constants.INITIAL_VIEW, Constants.PART_ONE);
        aggregate.add(engine, Constants.INITIAL_VIEW, Constants.PART_TWO);
    }

    // presumably these set up metaCollector / featExtractorConnector -- defined outside this method
    getMetaCollector(parameterList);
    getFeatureExtractorCollector(parameterList);

    // First pass: meta collection. Second pass: feature extraction.
    SimplePipeline.runPipeline(pairReader, aggregate.createAggregateDescription(), metaCollector);
    SimplePipeline.runPipeline(pairReader, aggregate.createAggregateDescription(), featExtractorConnector);

    File jsonOutput = new File(outputPath, JsonDataWriter.JSON_FILE_NAME);
    List<String> jsonLines = FileUtils.readLines(jsonOutput, "utf-8");
    Gson parser = new Gson();
    for (String jsonLine : jsonLines) {
        instanceList.add(parser.fromJson(jsonLine, Instance.class));
    }

    // Exactly one instance line with a single distinct outcome is expected.
    assertEquals(1, jsonLines.size());
    assertEquals(1, getUniqueOutcomes(instanceList).size());

    featureNames = getFeatureNames(instanceList);
    for (Instance instance : instanceList) {
        outcomeList.add(instance.getOutcomes());
    }
}
Use of org.apache.uima.fit.factory.AggregateBuilder in the dkpro-tc project by dkpro.
Class LuceneNGramCPMetaCollectorTest, method combinedNgramPairMetaCollectorTest.
/**
 * Runs {@link LuceneNGramCPMetaCollector} over the text-pair test data and checks the terms it
 * wrote to the combo n-gram field of the Lucene index in a temporary folder.
 *
 * <p>The index is expected to hold 65 terms in that field; if the term
 * {@code "mice_ANDcats_."} is present, it must have a document frequency and a total term
 * frequency of 1.
 *
 * @throws Exception if the pipeline cannot be built or run, or the index cannot be read
 */
@Test
public void combinedNgramPairMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class, TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR);

    // Segment and annotate both parts of the pair.
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_TWO);

    // The collector reads from and writes to the same temporary directory.
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(LuceneNGramCPMetaCollector.class, LuceneNGramCPFE.PARAM_UNIQUE_EXTRACTOR_NAME, "123", LuceneNGramCPFE.PARAM_SOURCE_LOCATION, tmpDir, LuceneNGramPMetaCollector.PARAM_TARGET_LOCATION, tmpDir);

    // The loop body is intentionally empty: the test fails if the for-loop is removed,
    // presumably because iterating the JCasIterable is what actually runs the pipeline.
    for (@SuppressWarnings("unused") JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // nothing to do per document
    }

    int termCount = 0;
    // try-with-resources closes the index reader even when an assertion fails,
    // so no file handles stay open on the temporary index directory.
    try (IndexReader index = DirectoryReader.open(FSDirectory.open(tmpDir))) {
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // Frequencies are only verified if this combined n-gram occurs at all.
                    if (text.utf8ToString().equals("mice_ANDcats_.")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(1, termsEnum.totalTermFreq());
                    }
                    termCount++;
                }
            }
        }
    } catch (Exception e) {
        // AssertionError is not an Exception, so failed assertions are not wrapped here.
        throw new ResourceInitializationException(e);
    }
    assertEquals(65, termCount);
}
Use of org.apache.uima.fit.factory.AggregateBuilder in the dkpro-tc project by dkpro.
Class PPipelineTestBase, method runPipeline.
/**
 * Runs the pair test pipeline with segmentation only and loads the resulting instances.
 *
 * <p>The same reader and segmenter aggregate are executed twice: once followed by
 * {@code metaCollector} and once followed by {@code featExtractorConnector}. Afterwards the
 * JSON file written below {@code outputPath} is parsed line by line into {@code instanceList},
 * and {@code featureNames} and {@code outcomeList} are filled from those instances.
 *
 * @throws Exception if building or running the pipeline, or reading the JSON output, fails
 */
protected void runPipeline() throws Exception {
    List<Object> parameterList = new ArrayList<Object>(Arrays.asList(parameters));

    CollectionReaderDescription pairReader = CollectionReaderFactory.createReaderDescription(
            TestPairReader.class, TestPairReader.PARAM_INPUT_FILE, setTestPairsLocation());
    AnalysisEngineDescription tokenizer =
            AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);

    // The segmenter is registered twice, once per part of the pair.
    AggregateBuilder aggregate = new AggregateBuilder();
    for (String pairView : new String[] { Constants.PART_ONE, Constants.PART_TWO }) {
        aggregate.add(tokenizer, Constants.INITIAL_VIEW, pairView);
    }

    // presumably these set up metaCollector / featExtractorConnector -- defined outside this method
    getMetaCollector(parameterList);
    getFeatureExtractorCollector(parameterList);

    // First pass: meta collection. Second pass: feature extraction.
    SimplePipeline.runPipeline(pairReader, aggregate.createAggregateDescription(), metaCollector);
    SimplePipeline.runPipeline(pairReader, aggregate.createAggregateDescription(), featExtractorConnector);

    File jsonOutput = new File(outputPath, JsonDataWriter.JSON_FILE_NAME);
    List<String> jsonLines = FileUtils.readLines(jsonOutput, "utf-8");
    Gson parser = new Gson();
    for (String jsonLine : jsonLines) {
        instanceList.add(parser.fromJson(jsonLine, Instance.class));
    }

    // Exactly one instance line with a single distinct outcome is expected.
    assertEquals(1, jsonLines.size());
    assertEquals(1, getUniqueOutcomes(instanceList).size());

    featureNames = getFeatureNames(instanceList);
    for (Instance instance : instanceList) {
        outcomeList.add(instance.getOutcomes());
    }
}
Use of org.apache.uima.fit.factory.AggregateBuilder in the dkpro-tc project by dkpro.
Class LuceneNGramPMetaCollectorTest, method lucenePairNgramMetaCollectorTest.
/**
 * Runs {@link LuceneNGramPMetaCollector} over the text-pair test data and checks the terms it
 * wrote to the word n-gram field of the Lucene index in a temporary folder.
 *
 * <p>The index is expected to hold 16 terms in that field; if the term {@code "this"} is
 * present, it must occur in 2 documents with a total term frequency of 3.
 *
 * @throws Exception if the pipeline cannot be built or run, or the index cannot be read
 */
@Test
public void lucenePairNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class, TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR);

    // Segment and annotate both parts of the pair.
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_TWO);

    // The collector reads from and writes to the same temporary directory.
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(LuceneNGramPMetaCollector.class, LuceneNGramPFE.PARAM_UNIQUE_EXTRACTOR_NAME, "123", LuceneNGramPFE.PARAM_SOURCE_LOCATION, tmpDir, LuceneNGramPMetaCollector.PARAM_TARGET_LOCATION, tmpDir);

    // The loop body is intentionally empty: the test fails if the for-loop is removed,
    // presumably because iterating the JCasIterable is what actually runs the pipeline.
    for (@SuppressWarnings("unused") JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // nothing to do per document
    }

    int termCount = 0;
    // try-with-resources closes the index reader even when an assertion fails,
    // so no file handles stay open on the temporary index directory.
    try (IndexReader index = DirectoryReader.open(FSDirectory.open(tmpDir))) {
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(WordNGram.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // Frequencies are only verified if this unigram occurs at all.
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    termCount++;
                }
            }
        }
    } catch (Exception e) {
        // AssertionError is not an Exception, so failed assertions are not wrapped here.
        throw new ResourceInitializationException(e);
    }
    assertEquals(16, termCount);
}
Use of org.apache.uima.fit.factory.AggregateBuilder in the dkpro-tc project by dkpro.
Class VectorizationTask, method learningModeDependedVectorizationAnnotator.
/**
 * Builds the aggregate engine that performs vectorization for the configured
 * {@code featureMode} / {@code learningMode} combination.
 *
 * <p>The aggregate always starts with an {@link IdentificationCollector}, followed by exactly
 * one vectorization engine: for document mode the engine is chosen by the learning mode
 * (regression, single-label, multi-label); for sequence mode the learning mode is not consulted.
 *
 * @param outputDir  passed to the engines as their target directory
 * @param mappingDir passed to the vectorization engine as its preparation directory
 * @return the aggregate description of the collector plus the selected vectorization engine
 * @throws ResourceInitializationException if the feature mode is {@code null}, if the learning
 *         mode is {@code null} in document mode, if the mode combination is not supported, or
 *         if an engine description cannot be created
 */
private AnalysisEngineDescription learningModeDependedVectorizationAnnotator(File outputDir, File mappingDir) throws ResourceInitializationException {
    if (featureMode == null) {
        throw new ResourceInitializationException(new IllegalStateException("Feature mode is [null]"));
    }
    AggregateBuilder builder = new AggregateBuilder();
    // records which document ids are in the train / test set (this is not
    // clear for cross-validation tasks)
    builder.add(createEngineDescription(IdentificationCollector.class, IdentificationCollector.PARAM_TARGET_DIRECTORY, outputDir, IdentificationCollector.PARAM_MODE, featureMode, IdentificationCollector.PARAM_USER_SET_MAXIMUM_LENGTH, maximumLength));
    switch (featureMode) {
    case Constants.FM_DOCUMENT:
        // A switch on a null String throws a bare NullPointerException; report it as a
        // configuration error instead, consistent with the feature-mode check above.
        if (learningMode == null) {
            throw new ResourceInitializationException(new IllegalStateException("Learning mode is [null]"));
        }
        switch (learningMode) {
        case Constants.LM_REGRESSION:
            builder.add(createEngineDescription(VectorizationDoc2Regression.class, VectorizationDoc2Regression.PARAM_TARGET_DIRECTORY, outputDir, VectorizationDoc2Regression.PARAM_PREPARATION_DIRECTORY, mappingDir, VectorizationDoc2Regression.PARAM_TO_INTEGER, integerVectorization));
            break;
        case Constants.LM_SINGLE_LABEL:
            builder.add(createEngineDescription(VectorizationDoc2SingleLabel.class, VectorizationDoc2SingleLabel.PARAM_TARGET_DIRECTORY, outputDir, VectorizationDoc2SingleLabel.PARAM_PREPARATION_DIRECTORY, mappingDir, VectorizationDoc2SingleLabel.PARAM_TO_INTEGER, integerVectorization));
            break;
        case Constants.LM_MULTI_LABEL:
            builder.add(createEngineDescription(VectorizationDocDoc2MultiLabel.class, VectorizationDocDoc2MultiLabel.PARAM_TARGET_DIRECTORY, outputDir, VectorizationDocDoc2MultiLabel.PARAM_PREPARATION_DIRECTORY, mappingDir, VectorizationDocDoc2MultiLabel.PARAM_TO_INTEGER, integerVectorization));
            break;
        default:
            throw new ResourceInitializationException(new IllegalStateException("Combination of feature mode [" + featureMode + "] with learning mode [" + learningMode + "] not defined"));
        }
        break;
    case Constants.FM_SEQUENCE:
        // NOTE(review): PARAM_TO_INTEGER is taken from VectorizationDocDoc2MultiLabel here, not
        // from VectorizationSeq2SeqOfLabel -- presumably the same constant; confirm and align.
        builder.add(createEngineDescription(VectorizationSeq2SeqOfLabel.class, VectorizationSeq2SeqOfLabel.PARAM_TARGET_DIRECTORY, outputDir, VectorizationSeq2SeqOfLabel.PARAM_PREPARATION_DIRECTORY, mappingDir, VectorizationDocDoc2MultiLabel.PARAM_TO_INTEGER, integerVectorization));
        break;
    default:
        throw new ResourceInitializationException(new IllegalStateException("Combination of feature mode [" + featureMode + "] with learning mode [" + learningMode + "] not defined"));
    }
    return builder.createAggregateDescription();
}
Aggregations