Search in sources :

Example 1 with AggregateBuilder

Use of org.apache.uima.fit.factory.AggregateBuilder in project dkpro-tc by dkpro.

In the class PPipelineTestBase, method runPipeline.

/**
 * Runs the meta collector and the feature extractor connector over the test
 * pairs and loads the instances that were written as JSON.
 *
 * <p>Segmenter, pair-mode annotator, stemmer, lemmatizer and POS tagger are
 * each registered twice in the aggregate: once mapped to
 * {@code Constants.PART_ONE} and once to {@code Constants.PART_TWO}. After both
 * pipelines have run, every line of the JSON output file is parsed into an
 * {@code Instance}; the method asserts that there is exactly one line and one
 * unique outcome, then stores the feature names and the per-instance outcomes
 * in the corresponding fields.
 *
 * @throws Exception if building or running a pipeline, or reading the output
 *         file, fails
 */
protected void runPipeline() throws Exception {
    List<Object> collectorParams = new ArrayList<Object>(Arrays.asList(parameters));

    CollectionReaderDescription pairReader = CollectionReaderFactory.createReaderDescription(
            TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, setTestPairsLocation());

    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription stemmer = AnalysisEngineFactory.createEngineDescription(SnowballStemmer.class);
    AnalysisEngineDescription lemmatizer = AnalysisEngineFactory.createEngineDescription(MorphaLemmatizer.class);
    AnalysisEngineDescription posTagger = AnalysisEngineFactory.createEngineDescription(OpenNlpPosTagger.class);
    AnalysisEngineDescription pairAnno = AnalysisEngineFactory.createEngineDescription(
            DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR);

    // Array order is the order in which the engines are added to the aggregate.
    AnalysisEngineDescription[] stages = { segmenter, pairAnno, stemmer, lemmatizer, posTagger };

    AggregateBuilder aggregate = new AggregateBuilder();
    for (AnalysisEngineDescription stage : stages) {
        // Each engine is added once per half of the text pair.
        aggregate.add(stage, Constants.INITIAL_VIEW, Constants.PART_ONE);
        aggregate.add(stage, Constants.INITIAL_VIEW, Constants.PART_TWO);
    }

    getMetaCollector(collectorParams);
    getFeatureExtractorCollector(collectorParams);

    // First pass: meta collector.
    SimplePipeline.runPipeline(pairReader, aggregate.createAggregateDescription(), metaCollector);
    // Second pass: feature extractor(s).
    SimplePipeline.runPipeline(pairReader, aggregate.createAggregateDescription(), featExtractorConnector);

    Gson jsonParser = new Gson();
    File jsonFile = new File(outputPath, JsonDataWriter.JSON_FILE_NAME);
    List<String> jsonLines = FileUtils.readLines(jsonFile, "utf-8");
    for (String jsonLine : jsonLines) {
        Instance parsed = jsonParser.fromJson(jsonLine, Instance.class);
        instanceList.add(parsed);
    }

    assertEquals(1, jsonLines.size());
    assertEquals(1, getUniqueOutcomes(instanceList).size());

    featureNames = getFeatureNames(instanceList);
    for (Instance instance : instanceList) {
        outcomeList.add(instance.getOutcomes());
    }
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) AggregateBuilder(org.apache.uima.fit.factory.AggregateBuilder) Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) Gson(com.google.gson.Gson) File(java.io.File)

Example 2 with AggregateBuilder

Use of org.apache.uima.fit.factory.AggregateBuilder in project dkpro-tc by dkpro.

In the class LuceneNGramCPMetaCollectorTest, method combinedNgramPairMetaCollectorTest.

/**
 * Runs {@code LuceneNGramCPMetaCollector} over the text pairs in
 * {@code src/test/resources/data/textpairs.txt} and inspects the Lucene index
 * it writes to a temporary folder.
 *
 * <p>The test counts all terms of the field
 * {@code LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO} and expects 65 of them. If the
 * term {@code "mice_ANDcats_."} is present, its document frequency and total
 * term frequency must both be 1.
 *
 * @throws Exception if the pipeline cannot be built or run; failures while
 *         reading the index are wrapped in a
 *         {@code ResourceInitializationException}
 */
@Test
public void combinedNgramPairMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class, TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR);
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_TWO);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(LuceneNGramCPMetaCollector.class, LuceneNGramCPFE.PARAM_UNIQUE_EXTRACTOR_NAME, "123", LuceneNGramCPFE.PARAM_SOURCE_LOCATION, tmpDir, LuceneNGramPMetaCollector.PARAM_TARGET_LOCATION, tmpDir);
    // The loop body is intentionally empty: the test fails if this for-loop is
    // removed, so the iteration itself is required even though the CASes are
    // not inspected.
    for (@SuppressWarnings("unused") JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
    }
    int termCount = 0;
    // try-with-resources closes the reader and then the directory, so no file
    // handles on the temporary folder stay open after the test.
    try (FSDirectory directory = FSDirectory.open(tmpDir);
            IndexReader index = DirectoryReader.open(directory)) {
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // Frequencies are only checked for this one combined n-gram,
                    // and only when it actually occurs in the index.
                    if (text.utf8ToString().equals("mice_ANDcats_.")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(1, termsEnum.totalTermFreq());
                    }
                    termCount++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    assertEquals(65, termCount);
}
Also used : JCasIterable(org.apache.uima.fit.pipeline.JCasIterable) Terms(org.apache.lucene.index.Terms) JCas(org.apache.uima.jcas.JCas) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) TermsEnum(org.apache.lucene.index.TermsEnum) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) Fields(org.apache.lucene.index.Fields) MultiFields(org.apache.lucene.index.MultiFields) AggregateBuilder(org.apache.uima.fit.factory.AggregateBuilder) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) IndexReader(org.apache.lucene.index.IndexReader) File(java.io.File) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)

Example 3 with AggregateBuilder

Use of org.apache.uima.fit.factory.AggregateBuilder in project dkpro-tc by dkpro.

In the class PPipelineTestBase, method runPipeline.

/**
 * Runs the meta collector and the feature extractor connector over the test
 * pairs and loads the instances that were written as JSON.
 *
 * <p>Only a segmenter is placed in the aggregate, registered once mapped to
 * {@code Constants.PART_ONE} and once to {@code Constants.PART_TWO}. After both
 * pipelines have run, every line of the JSON output file is parsed into an
 * {@code Instance}; the method asserts that there is exactly one line and one
 * unique outcome, then stores the feature names and the per-instance outcomes
 * in the corresponding fields.
 *
 * @throws Exception if building or running a pipeline, or reading the output
 *         file, fails
 */
protected void runPipeline() throws Exception {
    List<Object> collectorParams = new ArrayList<Object>(Arrays.asList(parameters));

    CollectionReaderDescription pairReader = CollectionReaderFactory.createReaderDescription(
            TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, setTestPairsLocation());
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);

    // The segmenter is added once per half of the text pair.
    AggregateBuilder aggregate = new AggregateBuilder();
    aggregate.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    aggregate.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    getMetaCollector(collectorParams);
    getFeatureExtractorCollector(collectorParams);

    // First pass: meta collector.
    SimplePipeline.runPipeline(pairReader, aggregate.createAggregateDescription(), metaCollector);
    // Second pass: feature extractor(s).
    SimplePipeline.runPipeline(pairReader, aggregate.createAggregateDescription(), featExtractorConnector);

    Gson jsonParser = new Gson();
    File jsonFile = new File(outputPath, JsonDataWriter.JSON_FILE_NAME);
    List<String> jsonLines = FileUtils.readLines(jsonFile, "utf-8");
    for (String jsonLine : jsonLines) {
        Instance parsed = jsonParser.fromJson(jsonLine, Instance.class);
        instanceList.add(parsed);
    }

    assertEquals(1, jsonLines.size());
    assertEquals(1, getUniqueOutcomes(instanceList).size());

    featureNames = getFeatureNames(instanceList);
    for (Instance instance : instanceList) {
        outcomeList.add(instance.getOutcomes());
    }
}
Also used : CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) AggregateBuilder(org.apache.uima.fit.factory.AggregateBuilder) Instance(org.dkpro.tc.api.features.Instance) ArrayList(java.util.ArrayList) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) Gson(com.google.gson.Gson) File(java.io.File)

Example 4 with AggregateBuilder

Use of org.apache.uima.fit.factory.AggregateBuilder in project dkpro-tc by dkpro.

In the class LuceneNGramPMetaCollectorTest, method lucenePairNgramMetaCollectorTest.

/**
 * Runs {@code LuceneNGramPMetaCollector} over the text pairs in
 * {@code src/test/resources/data/textpairs.txt} and inspects the Lucene index
 * it writes to a temporary folder.
 *
 * <p>The test counts all terms of the field
 * {@code WordNGram.LUCENE_NGRAM_FIELD} and expects 16 of them. If the term
 * {@code "this"} is present, its document frequency must be 2 and its total
 * term frequency must be 3.
 *
 * @throws Exception if the pipeline cannot be built or run; failures while
 *         reading the index are wrapped in a
 *         {@code ResourceInitializationException}
 */
@Test
public void lucenePairNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();
    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class, TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");
    AnalysisEngineDescription segmenter = AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);
    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_PAIR);
    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);
    builder.add(doc, Constants.INITIAL_VIEW, Constants.PART_TWO);
    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(LuceneNGramPMetaCollector.class, LuceneNGramPFE.PARAM_UNIQUE_EXTRACTOR_NAME, "123", LuceneNGramPFE.PARAM_SOURCE_LOCATION, tmpDir, LuceneNGramPMetaCollector.PARAM_TARGET_LOCATION, tmpDir);
    // The loop body is intentionally empty: the test fails if this for-loop is
    // removed, so the iteration itself is required even though the CASes are
    // not inspected.
    for (@SuppressWarnings("unused") JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
    }
    int termCount = 0;
    // try-with-resources closes the reader and then the directory, so no file
    // handles on the temporary folder stay open after the test.
    try (FSDirectory directory = FSDirectory.open(tmpDir);
            IndexReader index = DirectoryReader.open(directory)) {
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(WordNGram.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // Frequencies are only checked for this one term, and only
                    // when it actually occurs in the index.
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    termCount++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    assertEquals(16, termCount);
}
Also used : JCasIterable(org.apache.uima.fit.pipeline.JCasIterable) Terms(org.apache.lucene.index.Terms) JCas(org.apache.uima.jcas.JCas) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) TermsEnum(org.apache.lucene.index.TermsEnum) CollectionReaderDescription(org.apache.uima.collection.CollectionReaderDescription) Fields(org.apache.lucene.index.Fields) MultiFields(org.apache.lucene.index.MultiFields) AggregateBuilder(org.apache.uima.fit.factory.AggregateBuilder) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) IndexReader(org.apache.lucene.index.IndexReader) File(java.io.File) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)

Example 5 with AggregateBuilder

Use of org.apache.uima.fit.factory.AggregateBuilder in project dkpro-tc by dkpro.

In the class VectorizationTask, method learningModeDependedVectorizationAnnotator.

/**
 * Builds the aggregate engine used for vectorization: an
 * {@code IdentificationCollector} followed by the vectorization annotator that
 * matches the configured feature mode and learning mode.
 *
 * <p>Supported combinations are document mode with regression, single-label or
 * multi-label learning, and sequence mode (where the learning mode is not
 * consulted).
 *
 * @param outputDir
 *            passed to every engine as its target directory
 * @param mappingDir
 *            passed to the vectorization annotator as its preparation directory
 * @return the aggregate description containing both engines
 * @throws ResourceInitializationException
 *             if the feature mode is {@code null}, if the learning mode is
 *             {@code null} in document mode, if the combination of modes is
 *             not supported, or if an engine description cannot be created
 */
private AnalysisEngineDescription learningModeDependedVectorizationAnnotator(File outputDir, File mappingDir) throws ResourceInitializationException {
    if (featureMode == null) {
        throw new ResourceInitializationException(new IllegalStateException("Feature mode is [null]"));
    }
    AggregateBuilder builder = new AggregateBuilder();
    // records which document ids are in the train / test set (this is not
    // clear for cross-validation tasks)
    builder.add(createEngineDescription(IdentificationCollector.class, IdentificationCollector.PARAM_TARGET_DIRECTORY, outputDir, IdentificationCollector.PARAM_MODE, featureMode, IdentificationCollector.PARAM_USER_SET_MAXIMUM_LENGTH, maximumLength));
    switch(featureMode) {
        case Constants.FM_DOCUMENT:
            // Switching on a null string would throw a bare NullPointerException;
            // report it through the declared exception type instead.
            if (learningMode == null) {
                throw new ResourceInitializationException(new IllegalStateException("Learning mode is [null]"));
            }
            switch(learningMode) {
                case Constants.LM_REGRESSION:
                    builder.add(createEngineDescription(VectorizationDoc2Regression.class, VectorizationDoc2Regression.PARAM_TARGET_DIRECTORY, outputDir, VectorizationDoc2Regression.PARAM_PREPARATION_DIRECTORY, mappingDir, VectorizationDoc2Regression.PARAM_TO_INTEGER, integerVectorization));
                    break;
                case Constants.LM_SINGLE_LABEL:
                    builder.add(createEngineDescription(VectorizationDoc2SingleLabel.class, VectorizationDoc2SingleLabel.PARAM_TARGET_DIRECTORY, outputDir, VectorizationDoc2SingleLabel.PARAM_PREPARATION_DIRECTORY, mappingDir, VectorizationDoc2SingleLabel.PARAM_TO_INTEGER, integerVectorization));
                    break;
                case Constants.LM_MULTI_LABEL:
                    builder.add(createEngineDescription(VectorizationDocDoc2MultiLabel.class, VectorizationDocDoc2MultiLabel.PARAM_TARGET_DIRECTORY, outputDir, VectorizationDocDoc2MultiLabel.PARAM_PREPARATION_DIRECTORY, mappingDir, VectorizationDocDoc2MultiLabel.PARAM_TO_INTEGER, integerVectorization));
                    break;
                default:
                    throw new ResourceInitializationException(new IllegalStateException("Combination of feature mode [" + featureMode + "] with learning mode [" + learningMode + "] not defined"));
            }
            break;
        case Constants.FM_SEQUENCE:
            // NOTE(review): PARAM_TO_INTEGER is taken from
            // VectorizationDocDoc2MultiLabel here, unlike the other two
            // parameters - confirm whether VectorizationSeq2SeqOfLabel declares
            // its own constant with the same value.
            builder.add(createEngineDescription(VectorizationSeq2SeqOfLabel.class, VectorizationSeq2SeqOfLabel.PARAM_TARGET_DIRECTORY, outputDir, VectorizationSeq2SeqOfLabel.PARAM_PREPARATION_DIRECTORY, mappingDir, VectorizationDocDoc2MultiLabel.PARAM_TO_INTEGER, integerVectorization));
            break;
        default:
            throw new ResourceInitializationException(new IllegalStateException("Combination of feature mode [" + featureMode + "] with learning mode [" + learningMode + "] not defined"));
    }
    return builder.createAggregateDescription();
}
Also used : IdentificationCollector(org.dkpro.tc.core.task.deep.anno.IdentificationCollector) VectorizationDoc2SingleLabel(org.dkpro.tc.core.task.deep.anno.VectorizationDoc2SingleLabel) AggregateBuilder(org.apache.uima.fit.factory.AggregateBuilder) ResourceInitializationException(org.apache.uima.resource.ResourceInitializationException) AnalysisEngineDescription(org.apache.uima.analysis_engine.AnalysisEngineDescription) VectorizationDoc2Regression(org.dkpro.tc.core.task.deep.anno.VectorizationDoc2Regression) VectorizationDocDoc2MultiLabel(org.dkpro.tc.core.task.deep.anno.VectorizationDocDoc2MultiLabel) VectorizationSeq2SeqOfLabel(org.dkpro.tc.core.task.deep.anno.VectorizationSeq2SeqOfLabel)

Aggregations

AggregateBuilder (org.apache.uima.fit.factory.AggregateBuilder): 9, AnalysisEngineDescription (org.apache.uima.analysis_engine.AnalysisEngineDescription): 8, File (java.io.File): 6, CollectionReaderDescription (org.apache.uima.collection.CollectionReaderDescription): 4, ResourceInitializationException (org.apache.uima.resource.ResourceInitializationException): 4, ArrayList (java.util.ArrayList): 3, Gson (com.google.gson.Gson): 2, Fields (org.apache.lucene.index.Fields): 2, IndexReader (org.apache.lucene.index.IndexReader): 2, MultiFields (org.apache.lucene.index.MultiFields): 2, Terms (org.apache.lucene.index.Terms): 2, TermsEnum (org.apache.lucene.index.TermsEnum): 2, BytesRef (org.apache.lucene.util.BytesRef): 2, JCasIterable (org.apache.uima.fit.pipeline.JCasIterable): 2, JCas (org.apache.uima.jcas.JCas): 2, Instance (org.dkpro.tc.api.features.Instance): 2, AssignIdConnector (org.dkpro.tc.core.task.uima.AssignIdConnector): 2, Test (org.junit.Test): 2, ExternalResourceDescription (org.apache.uima.resource.ExternalResourceDescription): 1, TextClassificationException (org.dkpro.tc.api.exception.TextClassificationException): 1